Refactor similarities to use the cdsc_ecology_utils submodule.
This commit is contained in:
parent
98c1317af5
commit
930ee47d2b
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
[submodule "cdsc_ecology_utils"]
|
||||||
|
path = cdsc_ecology_utils
|
||||||
|
url = code:cdsc_ecology_utils
|
@ -1,2 +1,2 @@
|
|||||||
from .timeseries import load_clusters, load_densities, build_cluster_timeseries
|
from timeseries import load_clusters, load_densities, build_cluster_timeseries
|
||||||
|
from cdsc_ecology_utils import similarity_functions
|
||||||
|
1
cdsc_ecology_utils
Submodule
1
cdsc_ecology_utils
Submodule
@ -0,0 +1 @@
|
|||||||
|
Subproject commit 5b5fdbb3c02fc9d60a784de5635812bf97020d3e
|
@ -1,12 +1,12 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import fire
|
import fire
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from similarities_helper import similarities, column_similarities
|
from cdsc_ecology_utils.similarity import similarities, column_similarities
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'):
|
def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'):
|
||||||
|
|
||||||
return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
|
return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_communities=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
|
||||||
|
|
||||||
# change so that these take in an input as an optional argument (for speed, but also for idf).
|
# change so that these take in an input as an optional argument (for speed, but also for idf).
|
||||||
def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
|
def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import fire
|
import fire
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from similarities_helper import *
|
from cdsc_ecology_utils.similarity.similarity_functions import lsi_column_similarities, similarities
|
||||||
#from similarities_helper import similarities, lsi_column_similarities
|
#from similarities_helper import similarities, lsi_column_similarities
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
@ -30,7 +30,7 @@ def lsi_similarities(inpath, term_colname, outfile, min_df=None, max_df=None, in
|
|||||||
|
|
||||||
simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=n_iter,random_state=random_state,algorithm=algorithm,lsi_model_save=lsi_model)
|
simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=n_iter,random_state=random_state,algorithm=algorithm,lsi_model_save=lsi_model)
|
||||||
|
|
||||||
return similarities(inpath=inpath, simfunc=simfunc, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
|
return similarities(inpath=inpath, simfunc=simfunc, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_communities=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
|
||||||
|
|
||||||
# change so that these take in an input as an optional argument (for speed, but also for idf).
|
# change so that these take in an input as an optional argument (for speed, but also for idf).
|
||||||
def term_lsi_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet',outfile=None, min_df=None, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None, algorithm='arpack', n_components=300,n_iter=5,random_state=1968):
|
def term_lsi_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet',outfile=None, min_df=None, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None, algorithm='arpack', n_components=300,n_iter=5,random_state=1968):
|
||||||
|
@ -1,10 +1,11 @@
|
|||||||
import fire
|
import fire
|
||||||
from pyspark.sql import SparkSession
|
from pyspark.sql import SparkSession
|
||||||
from pyspark.sql import functions as f
|
from pyspark.sql import functions as f
|
||||||
from similarities_helper import tfidf_dataset, build_weekly_tfidf_dataset, select_topN_subreddits
|
from cdsc_ecology_utils.similarity.similarity_functions import tfidf_dataset, \
|
||||||
|
build_weekly_tfidf_dataset, select_topN_communities
|
||||||
|
|
||||||
def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits):
|
def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits):
|
||||||
spark = SparkSession.builder.getOrCreate()y
|
spark = SparkSession.builder.getOrCreate()
|
||||||
|
|
||||||
df = spark.read.parquet(inpath)
|
df = spark.read.parquet(inpath)
|
||||||
|
|
||||||
@ -13,7 +14,7 @@ def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_
|
|||||||
if included_subreddits is not None:
|
if included_subreddits is not None:
|
||||||
include_subs = set(map(str.strip,open(included_subreddits)))
|
include_subs = set(map(str.strip,open(included_subreddits)))
|
||||||
else:
|
else:
|
||||||
include_subs = select_topN_subreddits(topN)
|
include_subs = select_topN_communities(topN)
|
||||||
|
|
||||||
dfwriter = func(df, include_subs, term_colname)
|
dfwriter = func(df, include_subs, term_colname)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user