No longer need to convert Spark dates into isoformat.

Nathan TeBlunthuis 2024-12-28 13:55:54 -08:00
parent 104b708ff6
commit 02ec11f726

@@ -40,7 +40,7 @@ def _week_similarities(week, simfunc, tfidf_path, term_colname, included_subredd
                             term_colname=term_colname,
                             included_subreddits=included_subreddits,
                             topN=topN,
-                            week=week.isoformat(),
+                            week=week,
                             rescale_idf=False)
     tfidf_colname='tf_idf'
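The week argument can now stay a plain Python date, presumably because the lookup runs through duckdb (see the next hunk) and duckdb's Python client binds datetime.date values directly as DATE parameters, where the earlier Spark-based filter compared against ISO-formatted strings. A minimal sketch of that binding behavior; the table name and values here are made up for illustration and are not from this repository:

    import datetime
    import duckdb

    conn = duckdb.connect()
    conn.execute("CREATE TABLE tfidf (week DATE, subreddit VARCHAR)")
    conn.execute("INSERT INTO tfidf VALUES (DATE '2024-12-22', 'askscience')")

    # duckdb accepts the date object as-is; no week.isoformat() needed.
    week = datetime.date(2024, 12, 22)
    rows = conn.execute("SELECT * FROM tfidf WHERE week = ?", [week]).df()
    print(rows)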
@@ -78,7 +78,7 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subre
     print(outfile)
     # do this step in parallel if we have the memory for it.
     # should be doable with pool.map
     conn = duckdb.connect()
     subreddit_names = conn.execute(f"SELECT DISTINCT subreddit, subreddit_id from read_parquet('{tfidf_path}/*/*.parquet') ORDER BY subreddit_id;").df()
     nterms = conn.execute(f"SELECT MAX({term_colname + '_id'}) as nterms FROM read_parquet('{tfidf_path}/*/*.parquet')").df()
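The comments in this hunk suggest parallelizing the per-week step with pool.map. A hedged sketch of what that might look like, assuming _week_similarities keeps week as its first positional argument; the run_weekly wrapper and the pool size are hypothetical, not part of this commit:

    from functools import partial
    from multiprocessing import Pool

    def run_weekly(weeks, simfunc, tfidf_path, term_colname, included_subreddits, topN):
        # Pin the shared arguments so pool.map only varies the week.
        week_sims = partial(_week_similarities,
                            simfunc=simfunc,
                            tfidf_path=tfidf_path,
                            term_colname=term_colname,
                            included_subreddits=included_subreddits,
                            topN=topN)
        # Each worker loads a full weekly tf-idf matrix, so the pool size
        # is bounded by memory; 4 here is an arbitrary placeholder.
        with Pool(4) as pool:
            return pool.map(week_sims, weeks)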