add flag to run without overwriting completed parts.
parent fcdd2d2272
commit 31aaa03079
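The new keyword defaults to overwrite=True, so existing callers keep their current behavior; passing overwrite=False lets a rerun resume an interrupted job by skipping every week whose output already exists. A minimal sketch of the intended call, assuming the defaults shown in this diff (the outfile path below is a placeholder, not from this commit):

# Hypothetical resume of a partially completed weekly-similarity run;
# the output path is illustrative, only the overwrite keyword is new.
author_cosine_similarities_weekly_lsi(
    outfile='/gscratch/comdata/output/reddit_similarity/weekly/comment_authors_lsi',
    n_components=100,
    overwrite=False,  # leave finished weeks in place instead of recomputing them
)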
@@ -86,7 +86,7 @@ def cosine_similarities_weekly_lsi(*args, n_components=100, lsi_model=None, **kw
     return cosine_similarities_weekly(*args, simfunc=simfunc, **kwargs)
 
 #tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf_weekly/comment_submission_terms_tfidf.parquet')
-def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subreddits = None, topN = None, simfunc=column_similarities, min_df=0, max_df=None, static_tfidf_path=None, clusters=None, min_date=None, max_date=None, cores=1, term_ids=None):
+def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subreddits = None, topN = None, simfunc=column_similarities, min_df=0, max_df=None, static_tfidf_path=None, clusters=None, min_date=None, max_date=None, cores=1, term_ids=None, overwrite=True):
     print(outfile)
     # do this step in parallel if we have the memory for it.
     # should be doable with pool.map
@@ -121,6 +121,13 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subre
     weeks = weeks.week.values
     conn.close()
 
+    def output_exists(week):
+        outfile = Path(outdir) / str(week) / str(week)
+        return outfile.exists()
+
+    if not overwrite:
+        weeks = [week for week in weeks if not output_exists(week)]
+
     if clusters is not None:
         clusters_raw = pd.read_feather(clusters)
         clusters = duckdb.sql("SELECT A.subreddit AS sr_i, B.subreddit AS sr_j FROM clusters_raw AS A JOIN clusters_raw AS B ON A.cluster == B.cluster WHERE A.cluster != -1 AND B.cluster != -1").df()
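The skip test added above treats the existence of the per-week path <outdir>/<week>/<week> as proof that the week completed (outdir is defined earlier in the function, outside this hunk). A self-contained sketch of the same resume pattern, with hypothetical names:

from pathlib import Path

def filter_unfinished(weeks, outdir, overwrite=True):
    # Mirror of the output_exists check above: a week counts as "done"
    # when its per-week output path <outdir>/<week>/<week> already exists.
    if overwrite:
        return list(weeks)
    return [week for week in weeks
            if not (Path(outdir) / str(week) / str(week)).exists()]

Note the check is existence-only: a week interrupted mid-write still looks finished, so a corrupted partial output would have to be deleted by hand before rerunning with overwrite=False.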
@@ -163,7 +170,7 @@ def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/re
                                       term_ids=term_ids)
 
 
-def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None, min_df=None, clusters=None, min_date=None, max_date=None,cores=1,term_ids=None):
+def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None, min_df=None, clusters=None, min_date=None, max_date=None,cores=1,term_ids=None, overwrite=True):
     return cosine_similarities_weekly_lsi(infile,
                                           outfile,
                                           'author',
@@ -176,11 +183,12 @@ def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/o
                                           min_date=min_date,
                                           max_date=max_date,
                                           cores=cores,
-                                          term_ids=term_ids
+                                          term_ids=term_ids,
+                                          overwrite=overwrite
                                           )
 
 
-def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None,clusters=None,cores=1,term_ids=None,min_date=None,max_date=None):
+def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None,clusters=None,cores=1,term_ids=None,min_date=None,max_date=None, overwrite=True):
     return cosine_similarities_weekly_lsi(infile,
                                           outfile,
                                           'term',
@@ -192,7 +200,8 @@ def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/out
                                           min_date=min_date,
                                           max_date=max_date,
                                           cores=cores,
-                                          term_ids=term_ids
+                                          term_ids=term_ids,
+                                          overwrite=overwrite
                                           )
 
 if __name__ == "__main__":
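The body of the if __name__ == "__main__": block falls outside this diff; if it dispatches these functions through python-fire, as is common for scripts of this shape (an assumption, not shown in the commit), the new flag is reachable from the shell with no further changes:

# Hypothetical dispatch; fire parses --overwrite=False on the command
# line into the boolean keyword argument added by this commit.
import fire

if __name__ == "__main__":
    fire.Fire({'author_lsi': author_cosine_similarities_weekly_lsi,
               'term_lsi': term_cosine_similarities_weekly_lsi})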