From 9c6d7429b291c7497ef5e812476a8758586071d6 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Sat, 11 Jan 2025 22:46:43 -0800 Subject: [PATCH] Fix nterms query WHERE clause for optional min_df/max_df --- similarities/weekly_cosine_similarities.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/similarities/weekly_cosine_similarities.py b/similarities/weekly_cosine_similarities.py index f869fbf..be5531f 100755 --- a/similarities/weekly_cosine_similarities.py +++ b/similarities/weekly_cosine_similarities.py @@ -87,9 +87,15 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subre subreddit_names = conn.execute(f"SELECT DISTINCT subreddit, subreddit_id from read_parquet('{tfidf_path}/*/*.parquet') ORDER BY subreddit_id;").df() if static_tfidf_path is not None: - q = f"SELECT COUNT(DISTINCT({term_colname + '_id'})) as nterms FROM read_parquet('{static_tfidf_path}/*.parquet') WHERE count >= {min_df}" - if max_df is not None: - q = q + f"AND count <= {max_df}" + q = f"SELECT COUNT(DISTINCT({term_colname + '_id'})) as nterms FROM read_parquet('{static_tfidf_path}/*.parquet')" + if min_df is not None and max_df is not None: + q = q + f" WHERE count >= {min_df} AND count <= {max_df}" + else: + if min_df is not None: + q = q + f" WHERE count >= {min_df}" + if max_df is not None: + q = q + f" WHERE count <= {max_df}" + nterms = conn.execute(q).df() else: nterms = conn.execute(f"SELECT MAX({term_colname + '_id'}) as nterms FROM read_parquet('{tfidf_path}/*/*.parquet')").df()