No longer need to convert Spark dates to ISO-format strings.
This commit is contained in: parent 104b708ff6, commit 02ec11f726
@@ -40,7 +40,7 @@ def _week_similarities(week, simfunc, tfidf_path, term_colname, included_subredd
                          term_colname=term_colname,
                          included_subreddits=included_subreddits,
                          topN=topN,
-                         week=week.isoformat(),
+                         week=week,
                          rescale_idf=False)
 
     tfidf_colname='tf_idf'
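The change in this hunk passes the week through as a date object instead of an ISO string. A minimal sketch of why the string round-trip can be dropped, assuming the week is a Python datetime.date that eventually reaches a duckdb query (the value and query here are illustrative, not this repo's code):

import datetime

import duckdb

week = datetime.date(2020, 3, 2)  # hypothetical week value
conn = duckdb.connect()
# duckdb binds datetime.date parameters natively as DATE values,
# so no .isoformat() string conversion is needed first.
print(conn.execute("SELECT ? AS week", [week]).df())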
@@ -78,7 +78,7 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subre
     print(outfile)
     # do this step in parallel if we have the memory for it.
     # should be doable with pool.map
-    conn = duckdb.connect()
+    conn = duckdb.connect()
     subreddit_names = conn.execute(f"SELECT DISTINCT subreddit, subreddit_id from read_parquet('{tfidf_path}/*/*.parquet') ORDER BY subreddit_id;").df()
 
     nterms = conn.execute(f"SELECT MAX({term_colname + '_id'}) as nterms FROM read_parquet('{tfidf_path}/*/*.parquet')").df()
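The two comments above sketch a plan to parallelize this step with pool.map. A hedged sketch of that idea, assuming _week_similarities keeps the signature shown in the first hunk; the placeholder body, the driver function, and the process count are assumptions, not this repo's actual code:

from functools import partial
from multiprocessing import Pool

def _week_similarities(week, simfunc, tfidf_path, term_colname,
                       included_subreddits, topN):
    # placeholder body: the real function computes one week's similarities
    return week

def all_week_similarities(weeks, simfunc, tfidf_path, term_colname,
                          included_subreddits, topN, processes=4):
    # Bind everything except week, then fan the weeks out across a pool.
    # Memory permitting, each worker handles one week at a time.
    f = partial(_week_similarities, simfunc=simfunc, tfidf_path=tfidf_path,
                term_colname=term_colname,
                included_subreddits=included_subreddits, topN=topN)
    with Pool(processes) as pool:
        return pool.map(f, weeks)

One caveat: each worker would need to open its own duckdb connection rather than sharing conn, since connections do not survive pickling across processes.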