diff --git a/similarities/weekly_cosine_similarities.py b/similarities/weekly_cosine_similarities.py
index 5e0c908..7e1d668 100755
--- a/similarities/weekly_cosine_similarities.py
+++ b/similarities/weekly_cosine_similarities.py
@@ -86,7 +86,7 @@ def cosine_similarities_weekly_lsi(*args, n_components=100, lsi_model=None, **kw
     return cosine_similarities_weekly(*args, simfunc=simfunc, **kwargs)
 
 #tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf_weekly/comment_submission_terms_tfidf.parquet')
-def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subreddits = None, topN = None, simfunc=column_similarities, min_df=0, max_df=None, static_tfidf_path=None, clusters=None, min_date=None, max_date=None, cores=1, term_ids=None):
+def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subreddits = None, topN = None, simfunc=column_similarities, min_df=0, max_df=None, static_tfidf_path=None, clusters=None, min_date=None, max_date=None, cores=1, term_ids=None, overwrite=True):
     print(outfile)
     # do this step in parallel if we have the memory for it.
     # should be doable with pool.map
@@ -121,6 +121,13 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subre
     weeks = weeks.week.values
     conn.close()
 
+    def output_exists(week):
+        outfile = Path(outdir) / str(week) / str(week)
+        return outfile.exists()
+
+    if not overwrite:
+        weeks = [week for week in weeks if not output_exists(week)]
+
     if clusters is not None:
         clusters_raw = pd.read_feather(clusters)
         clusters = duckdb.sql("SELECT A.subreddit AS sr_i, B.subreddit AS sr_j FROM clusters_raw AS A JOIN clusters_raw AS B ON A.cluster == B.cluster WHERE A.cluster != -1 AND B.cluster != -1").df()
@@ -163,7 +170,7 @@ def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/re
                                       term_ids=term_ids)
 
 
-def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None, min_df=None, clusters=None, min_date=None, max_date=None,cores=1,term_ids=None):
+def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None, min_df=None, clusters=None, min_date=None, max_date=None,cores=1,term_ids=None, overwrite=True):
     return cosine_similarities_weekly_lsi(infile,
                                           outfile,
                                           'author',
@@ -176,11 +183,12 @@ def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/o
                                           min_date=min_date,
                                           max_date=max_date,
                                           cores=cores,
-                                          term_ids=term_ids
+                                          term_ids=term_ids,
+                                          overwrite=overwrite
                                           )
 
 
-def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None,clusters=None,cores=1,term_ids=None,min_date=None,max_date=None):
+def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None,clusters=None,cores=1,term_ids=None,min_date=None,max_date=None, overwrite=True):
     return cosine_similarities_weekly_lsi(infile,
                                           outfile,
                                           'term',
@@ -192,7 +200,8 @@ def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/out
                                          min_date=min_date,
                                          max_date=max_date,
                                          cores=cores,
-                                         term_ids=term_ids
+                                         term_ids=term_ids,
+                                         overwrite=overwrite
                                          )
 
 if __name__ == "__main__":