no longer need to convert from Spark dates into isoformat.
parent 104b708ff6
commit 02ec11f726
@@ -40,7 +40,7 @@ def _week_similarities(week, simfunc, tfidf_path, term_colname, included_subredd
     term_colname=term_colname,
     included_subreddits=included_subreddits,
     topN=topN,
-    week=week.isoformat(),
+    week=week,
     rescale_idf=False)

     tfidf_colname='tf_idf'
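The only change in this hunk is dropping the explicit .isoformat() call: week now flows through unchanged. A minimal sketch of why that is safe, assuming week arrives as a datetime.date (the type a Spark-written parquet DATE column materializes as in Python); the values here are illustrative:

    from datetime import date

    week = date(2020, 3, 1)          # what a Spark/parquet DATE column yields in Python
    print(week.isoformat())          # '2020-03-01' -- the old explicit conversion
    print(str(week))                 # str() of a date is already the same ISO form
    print(week == date(2020, 3, 1))  # dates compare directly; no string round-trip needed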
@@ -78,7 +78,7 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subre
     print(outfile)
     # do this step in parallel if we have the memory for it.
     # should be doable with pool.map
     conn = duckdb.connect()
     subreddit_names = conn.execute(f"SELECT DISTINCT subreddit, subreddit_id from read_parquet('{tfidf_path}/*/*.parquet') ORDER BY subreddit_id;").df()

     nterms = conn.execute(f"SELECT MAX({term_colname + '_id'}) as nterms FROM read_parquet('{tfidf_path}/*/*.parquet')").df()
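For context, the duckdb-over-parquet pattern used above: read_parquet() accepts a glob, execute() runs the SQL, and .df() materializes the result as a pandas DataFrame. A self-contained sketch with a hypothetical tfidf_path; the '/*/*.parquet' glob assumes one level of directory partitioning, as in the query above:

    import duckdb

    conn = duckdb.connect()  # in-memory database; enough for read-only queries
    tfidf_path = "/path/to/tfidf"  # hypothetical, stands in for the real tfidf_path
    subreddit_names = conn.execute(
        f"SELECT DISTINCT subreddit, subreddit_id "
        f"FROM read_parquet('{tfidf_path}/*/*.parquet') "
        f"ORDER BY subreddit_id;"
    ).df()

The nterms query is the same pattern with an aggregate (MAX over the term id column) in place of DISTINCT.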
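The "# should be doable with pool.map" comment sketches the intended parallelization: map the per-week similarity function over the distinct weeks. A hedged sketch under that assumption; _week_similarities here is a stand-in with fewer arguments than the real function, and weeks would be pulled from the parquet data:

    from functools import partial
    from multiprocessing import Pool

    def _week_similarities(week, tfidf_path):
        ...  # placeholder for the real per-week computation

    if __name__ == "__main__":
        weeks = []  # distinct week values read from the parquet data
        with Pool(4) as pool:  # only if memory allows, as the comment warns
            pool.map(partial(_week_similarities, tfidf_path="/path/to/tfidf"), weeks)

Each worker is a separate process, so several weeks' matrices are held in memory at once -- which is why the comment hedges on having the memory for it.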