NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Line structure matches the hunk headers, but
leading whitespace is reconstructed -- verify the context lines against
similarities/tfidf.py (or apply with `git apply --ignore-whitespace`).
NOTE(review): tfidf_post_comment_authors defaults outpath to
.../tfidf/post_authors.parquet, whereas the weekly variant writes
.../tfidf_weekly/post_comment_authors.parquet -- confirm the name is intended.

diff --git a/similarities/tfidf.py b/similarities/tfidf.py
index 7f579fa..79def3c 100644
--- a/similarities/tfidf.py
+++ b/similarities/tfidf.py
@@ -27,11 +27,24 @@ def tfidf(inpath, outpath, topN, term_colname, exclude, included_subreddits):
 def tfidf_weekly(inpath, outpath, topN, term_colname, exclude, included_subreddits):
     return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits)
 
-def tfidf_authors(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
+def tfidf_post_comment_authors(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/post_authors.parquet',
                   topN=25000,
                   included_subreddits=None):
 
-    return tfidf("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
+    return tfidf("/gscratch/comdata/output/reddit_ngrams/post_comment_authors.parquet",
+                 outpath,
+                 topN,
+                 'author',
+                 ['[deleted]','AutoModerator'],
+                 included_subreddits=included_subreddits
+                 )
+
+def tfidf_authors(inpath="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
+                  outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
+                  topN=25000,
+                  included_subreddits=None):
+
+    return tfidf(inpath,
                  outpath,
                  topN,
                  'author',
@@ -63,6 +76,18 @@ def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfi
                         included_subreddits=included_subreddits
                         )
 
+def tfidf_post_comment_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/post_comment_authors.parquet',
+                                      topN=25000,
+                                      included_subreddits=None):
+
+    return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/post_comment_authors.parquet",
+                        outpath,
+                        topN,
+                        'author',
+                        ['[deleted]','AutoModerator'],
+                        included_subreddits=included_subreddits
+                        )
+
 def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
                        topN=25000,
                        included_subreddits=None):