diff --git a/similarities/weekly_cosine_similarities.py b/similarities/weekly_cosine_similarities.py index a8f39d7..6ad5b8e 100755 --- a/similarities/weekly_cosine_similarities.py +++ b/similarities/weekly_cosine_similarities.py @@ -44,11 +44,12 @@ def _week_similarities(week, simfunc, tfidf_path, term_colname, included_subredd week=week, rescale_idf=False) + tfidf_colname='tf_idf' if term_ids is not None: entries = duckdb.sql(f"SELECT A.{tfidf_colname}, B.{term_id} AS {term_id_new}, A.subreddit_id_new FROM entries AS A JOIN term_ids AS B ON A.{term_id_new} == B.{term_id_old}").df() - tfidf_colname='tf_idf' + # if the max subreddit id we found is less than the number of subreddit names then we have to fill in 0s shape = (nterms,subreddit_names.shape[0]) print(shape)