From 79d1826ba4893391b2cf24b0bcc4a4c363185f9f Mon Sep 17 00:00:00 2001
From: Nathan TeBlunthuis
Date: Mon, 30 Dec 2024 16:17:31 -0800
Subject: [PATCH] enforce min_df constraint in counting lsi features.

---
 similarities/weekly_cosine_similarities.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/similarities/weekly_cosine_similarities.py b/similarities/weekly_cosine_similarities.py
index 47134c8..620ed37 100755
--- a/similarities/weekly_cosine_similarities.py
+++ b/similarities/weekly_cosine_similarities.py
@@ -75,7 +75,7 @@ def cosine_similarities_weekly_lsi(*args, n_components=100, lsi_model=None, **kw
     return cosine_similarities_weekly(*args, simfunc=simfunc, **kwargs)
 
 #tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf_weekly/comment_submission_terms_tfidf.parquet')
-def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subreddits = None, topN = None, simfunc=column_similarities, min_df=None, max_df=None, static_tfidf_path=None):
+def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subreddits = None, topN = None, simfunc=column_similarities, min_df=0, max_df=None, static_tfidf_path=None):
     print(outfile)
     # do this step in parallel if we have the memory for it.
     # should be doable with pool.map
@@ -83,7 +83,11 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subre
     subreddit_names = conn.execute(f"SELECT DISTINCT subreddit, subreddit_id from read_parquet('{tfidf_path}/*/*.parquet') ORDER BY subreddit_id;").df()
 
     if static_tfidf_path is not None:
-        nterms = conn.execute(f"SELECT COUNT(DISTINCT({term_colname + '_id'})) as nterms FROM read_parquet('{static_tfidf_path}/*.parquet') WHERE count >= {min_df} AND count <={max_df}").df()
+        q = f"SELECT COUNT(DISTINCT({term_colname + '_id'})) as nterms FROM read_parquet('{static_tfidf_path}/*.parquet') WHERE count >= {min_df}"
+        if max_df is not None:
+            q = q + f" AND count <= {max_df}"
+        nterms = conn.execute(q).df()
+
     else:
         nterms = conn.execute(f"SELECT MAX({term_colname + '_id'}) as nterms FROM read_parquet('{tfidf_path}/*/*.parquet')").df()
     nterms = nterms.nterms.values
@@ -111,7 +115,8 @@ def author_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/
                                       max_df,
                                       included_subreddits,
                                       topN,
-                                      min_df=2,
+                                      min_df=min_df,
+                                      max_df=max_df,
                                       static_tfidf_path=static_tfidf_path
                                       )
 
@@ -126,14 +131,15 @@ def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/re
                                     static_tfidf_path=static_tfidf_path)
 
 
-def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None):
+def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None, min_df=2):
     return cosine_similarities_weekly_lsi(infile,
                                           outfile,
                                           'author',
                                           included_subreddits=included_subreddits,
                                           n_components=n_components,
                                           lsi_model=lsi_model,
-                                          static_tfidf_path=static_tfidf_path
+                                          static_tfidf_path=static_tfidf_path,
+                                          min_df=min_df
                                           )
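
Reviewer note, not applied by the patch: a minimal standalone sketch of the
conditional WHERE-clause construction introduced above, assuming duckdb is
installed. count_lsi_terms and the example path are hypothetical stand-ins,
not names from this repository.

import duckdb

def count_lsi_terms(static_tfidf_path, term_colname, min_df=0, max_df=None):
    # Mirrors the patched branch of cosine_similarities_weekly: count the
    # distinct term ids whose document frequency lies in [min_df, max_df].
    conn = duckdb.connect()
    q = f"SELECT COUNT(DISTINCT({term_colname + '_id'})) as nterms FROM read_parquet('{static_tfidf_path}/*.parquet') WHERE count >= {min_df}"
    # Append the upper bound only when max_df is given; the leading space
    # keeps the concatenated SQL valid.
    if max_df is not None:
        q = q + f" AND count <= {max_df}"
    return conn.execute(q).df().nterms.values[0]

# e.g.: nterms = count_lsi_terms('/path/to/static_tfidf', 'term', min_df=2)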