diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..835c9cb --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "cdsc_ecology_utils"] + path = cdsc_ecology_utils + url = code:cdsc_ecology_utils diff --git a/__init__.py b/__init__.py index dbb8061..393bc3c 100644 --- a/__init__.py +++ b/__init__.py @@ -1,2 +1,2 @@ -from .timeseries import load_clusters, load_densities, build_cluster_timeseries - +from .timeseries import load_clusters, load_densities, build_cluster_timeseries +from .cdsc_ecology_utils import similarity_functions diff --git a/cdsc_ecology_utils b/cdsc_ecology_utils new file mode 160000 index 0000000..5b5fdbb --- /dev/null +++ b/cdsc_ecology_utils @@ -0,0 +1 @@ +Subproject commit 5b5fdbb3c02fc9d60a784de5635812bf97020d3e diff --git a/similarities/cosine_similarities.py b/similarities/cosine_similarities.py index b9bab17..e1a9efa 100644 --- a/similarities/cosine_similarities.py +++ b/similarities/cosine_similarities.py @@ -1,12 +1,12 @@ import pandas as pd import fire from pathlib import Path -from similarities_helper import similarities, column_similarities +from cdsc_ecology_utils.similarity.similarity_functions import similarities, column_similarities from functools import partial def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'): - return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) + return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_communities=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) 
# change so that these take in an input as an optional argument (for speed, but also for idf). def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): diff --git a/similarities/lsi_similarities.py b/similarities/lsi_similarities.py index eb89f55..2704a39 100644 --- a/similarities/lsi_similarities.py +++ b/similarities/lsi_similarities.py @@ -1,7 +1,7 @@ import pandas as pd import fire from pathlib import Path -from similarities_helper import * +from cdsc_ecology_utils.similarity.similarity_functions import lsi_column_similarities, similarities #from similarities_helper import similarities, lsi_column_similarities from functools import partial @@ -30,7 +30,7 @@ def lsi_similarities(inpath, term_colname, outfile, min_df=None, max_df=None, in simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=n_iter,random_state=random_state,algorithm=algorithm,lsi_model_save=lsi_model) - return similarities(inpath=inpath, simfunc=simfunc, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) + return similarities(inpath=inpath, simfunc=simfunc, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_communities=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) # change so that these take in an input as an optional argument (for speed, but also for idf). 
def term_lsi_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet',outfile=None, min_df=None, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None, algorithm='arpack', n_components=300,n_iter=5,random_state=1968): diff --git a/similarities/tfidf.py b/similarities/tfidf.py index 01b0b20..f17f586 100644 --- a/similarities/tfidf.py +++ b/similarities/tfidf.py @@ -1,10 +1,11 @@ import fire from pyspark.sql import SparkSession from pyspark.sql import functions as f -from similarities_helper import tfidf_dataset, build_weekly_tfidf_dataset, select_topN_subreddits +from cdsc_ecology_utils.similarity.similarity_functions import tfidf_dataset, \ + build_weekly_tfidf_dataset, select_topN_communities def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits): - spark = SparkSession.builder.getOrCreate()y + spark = SparkSession.builder.getOrCreate() df = spark.read.parquet(inpath) @@ -13,7 +14,7 @@ def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_ if included_subreddits is not None: include_subs = set(map(str.strip,open(included_subreddits))) else: - include_subs = select_topN_subreddits(topN) + include_subs = select_topN_communities(topN) dfwriter = func(df, include_subs, term_colname)