1
0

enforce min_df constraint in counting lsi features.

This commit is contained in:
Nathan TeBlunthuis 2024-12-30 16:17:31 -08:00
parent 3555542862
commit 79d1826ba4

View File

@@ -75,7 +75,7 @@ def cosine_similarities_weekly_lsi(*args, n_components=100, lsi_model=None, **kw
return cosine_similarities_weekly(*args, simfunc=simfunc, **kwargs) return cosine_similarities_weekly(*args, simfunc=simfunc, **kwargs)
#tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf_weekly/comment_submission_terms_tfidf.parquet') #tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf_weekly/comment_submission_terms_tfidf.parquet')
def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subreddits = None, topN = None, simfunc=column_similarities, min_df=None, max_df=None, static_tfidf_path=None): def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subreddits = None, topN = None, simfunc=column_similarities, min_df=0, max_df=None, static_tfidf_path=None):
print(outfile) print(outfile)
# do this step in parallel if we have the memory for it. # do this step in parallel if we have the memory for it.
# should be doable with pool.map # should be doable with pool.map
@@ -83,7 +83,11 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subre
subreddit_names = conn.execute(f"SELECT DISTINCT subreddit, subreddit_id from read_parquet('{tfidf_path}/*/*.parquet') ORDER BY subreddit_id;").df() subreddit_names = conn.execute(f"SELECT DISTINCT subreddit, subreddit_id from read_parquet('{tfidf_path}/*/*.parquet') ORDER BY subreddit_id;").df()
if static_tfidf_path is not None: if static_tfidf_path is not None:
nterms = conn.execute(f"SELECT COUNT(DISTINCT({term_colname + '_id'})) as nterms FROM read_parquet('{static_tfidf_path}/*.parquet') WHERE count >= {min_df} AND count <={max_df}").df() q = f"SELECT COUNT(DISTINCT({term_colname + '_id'})) as nterms FROM read_parquet('{static_tfidf_path}/*.parquet') WHERE count >= {min_df}"
if max_df is not None:
            q = q + f" AND count <= {max_df}"
nterms = conn.execute(q).df()
else: else:
nterms = conn.execute(f"SELECT MAX({term_colname + '_id'}) as nterms FROM read_parquet('{tfidf_path}/*/*.parquet')").df() nterms = conn.execute(f"SELECT MAX({term_colname + '_id'}) as nterms FROM read_parquet('{tfidf_path}/*/*.parquet')").df()
nterms = nterms.nterms.values nterms = nterms.nterms.values
@@ -111,7 +115,8 @@ def author_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/
max_df, max_df,
included_subreddits, included_subreddits,
topN, topN,
min_df=2, min_df=min_df,
max_df=max_df,
static_tfidf_path=static_tfidf_path static_tfidf_path=static_tfidf_path
) )
@@ -126,14 +131,15 @@ def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/re
static_tfidf_path=static_tfidf_path) static_tfidf_path=static_tfidf_path)
def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None): def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None, min_df=2):
return cosine_similarities_weekly_lsi(infile, return cosine_similarities_weekly_lsi(infile,
outfile, outfile,
'author', 'author',
included_subreddits=included_subreddits, included_subreddits=included_subreddits,
n_components=n_components, n_components=n_components,
lsi_model=lsi_model, lsi_model=lsi_model,
static_tfidf_path=static_tfidf_path static_tfidf_path=static_tfidf_path,
min_df=min_df
) )