1
0
This commit is contained in:
Nathan TeBlunthuis 2025-01-11 22:46:43 -08:00
parent 4c2ddc7455
commit 9c6d7429b2

View File

@@ -87,9 +87,15 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subre
subreddit_names = conn.execute(f"SELECT DISTINCT subreddit, subreddit_id from read_parquet('{tfidf_path}/*/*.parquet') ORDER BY subreddit_id;").df()
if static_tfidf_path is not None:
q = f"SELECT COUNT(DISTINCT({term_colname + '_id'})) as nterms FROM read_parquet('{static_tfidf_path}/*.parquet') WHERE count >= {min_df}"
q = f"SELECT COUNT(DISTINCT({term_colname + '_id'})) as nterms FROM read_parquet('{static_tfidf_path}/*.parquet')"
if min_df is not None and max_df is not None:
q = q + f" WHERE count >= {min_df} AND count <= {max_df}"
else:
if min_df is not None:
q = q + f" WHERE count >= {min_df}"
if max_df is not None:
q = q + f"AND count <= {max_df}"
q = q + f" WHERE count <= {max_df}"
nterms = conn.execute(q).df()
else:
nterms = conn.execute(f"SELECT MAX({term_colname + '_id'}) as nterms FROM read_parquet('{tfidf_path}/*/*.parquet')").df()