Bugfix in weekly similarities: fix the Pool import and the pool.map argument order
This commit is contained in:
parent
ac06a8757a
commit
003a48aea5
@ -8,7 +8,7 @@ import fire
|
|||||||
from itertools import islice
|
from itertools import islice
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from similarities_helper import *
|
from similarities_helper import *
|
||||||
from multiprocessing import pool
|
from multiprocessing import Pool, cpu_count
|
||||||
|
|
||||||
def _week_similarities(tempdir, term_colname, week):
|
def _week_similarities(tempdir, term_colname, week):
|
||||||
print(f"loading matrix: {week}")
|
print(f"loading matrix: {week}")
|
||||||
@ -40,7 +40,7 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None,
|
|||||||
print(f"computing weekly similarities for {len(included_subreddits)} subreddits")
|
print(f"computing weekly similarities for {len(included_subreddits)} subreddits")
|
||||||
|
|
||||||
print("creating temporary parquet with matrix indicies")
|
print("creating temporary parquet with matrix indicies")
|
||||||
tempdir = prep_tfidf_entries_weekly(tfidf, term_colname, min_df, included_subreddits)
|
tempdir = prep_tfidf_entries_weekly(tfidf, term_colname, min_df, max_df=None, included_subreddits=included_subreddits)
|
||||||
|
|
||||||
tfidf = spark.read.parquet(tempdir.name)
|
tfidf = spark.read.parquet(tempdir.name)
|
||||||
|
|
||||||
@ -57,11 +57,11 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None,
|
|||||||
def week_similarities_helper(week):
|
def week_similarities_helper(week):
|
||||||
_week_similarities(tempdir, term_colname, week)
|
_week_similarities(tempdir, term_colname, week)
|
||||||
|
|
||||||
with Pool(40) as pool: # maybe it can be done with 40 cores on the huge machine?
|
with Pool(cpu_count()) as pool: # maybe it can be done with 40 cores on the huge machine?
|
||||||
list(pool.map(weeks,week_similarities_helper))
|
list(pool.map(week_similarities_helper,weeks))
|
||||||
|
|
||||||
def author_cosine_similarities_weekly(outfile, min_df=2 , included_subreddits=None, topN=500):
|
def author_cosine_similarities_weekly(outfile, min_df=2 , included_subreddits=None, topN=500):
|
||||||
return cosine_similarities_weekly('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_100k.parquet',
|
return cosine_similarities_weekly('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet',
|
||||||
outfile,
|
outfile,
|
||||||
'author',
|
'author',
|
||||||
min_df,
|
min_df,
|
||||||
@ -69,7 +69,7 @@ def author_cosine_similarities_weekly(outfile, min_df=2 , included_subreddits=No
|
|||||||
topN)
|
topN)
|
||||||
|
|
||||||
def term_cosine_similarities_weekly(outfile, min_df=None, included_subreddits=None, topN=500):
|
def term_cosine_similarities_weekly(outfile, min_df=None, included_subreddits=None, topN=500):
|
||||||
return cosine_similarities_weekly('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms_100k.parquet',
|
return cosine_similarities_weekly('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
|
||||||
outfile,
|
outfile,
|
||||||
'term',
|
'term',
|
||||||
min_df,
|
min_df,
|
||||||
|
Loading…
Reference in New Issue
Block a user