diff --git a/similarities/weekly_cosine_similarities.py b/similarities/weekly_cosine_similarities.py index 235b4cf..1476b77 100755 --- a/similarities/weekly_cosine_similarities.py +++ b/similarities/weekly_cosine_similarities.py @@ -40,7 +40,7 @@ def _week_similarities(week, simfunc, tfidf_path, term_colname, included_subredd term_colname=term_colname, included_subreddits=included_subreddits, topN=topN, - week=week.isoformat(), + week=week, rescale_idf=False) tfidf_colname='tf_idf' @@ -78,7 +78,7 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subre print(outfile) # do this step in parallel if we have the memory for it. # should be doable with pool.map - conn = duckdb.connect() + conn = duckdb.connect() subreddit_names = conn.execute(f"SELECT DISTINCT subreddit, subreddit_id from read_parquet('{tfidf_path}/*/*.parquet') ORDER BY subreddit_id;").df() nterms = conn.execute(f"SELECT MAX({term_colname + '_id'}) as nterms FROM read_parquet('{tfidf_path}/*/*.parquet')").df()