Support passing in a list of tfidf vectors.
Also lowercases included subreddits.
This commit is contained in:
		
							parent
							
								
									003a48aea5
								
							
						
					
					
						commit
						806cfc948f
					
				| @ -8,9 +8,9 @@ def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, | ||||
|     return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) | ||||
| 
 | ||||
| 
 | ||||
| def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): | ||||
| def term_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): | ||||
| 
 | ||||
|     return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', | ||||
|     return cosine_similarities(infile, | ||||
|                                'term', | ||||
|                                outfile, | ||||
|                                min_df, | ||||
| @ -22,8 +22,8 @@ def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subredd | ||||
|                                to_date | ||||
|                                ) | ||||
| 
 | ||||
| def author_cosine_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None): | ||||
|     return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', | ||||
| def author_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None): | ||||
|     return cosine_similarities(infile, | ||||
|                                'author', | ||||
|                                outfile, | ||||
|                                min_df, | ||||
| @ -35,8 +35,8 @@ def author_cosine_similarities(outfile, min_df=2, max_df=None, included_subreddi | ||||
|                                to_date=to_date | ||||
|                                ) | ||||
| 
 | ||||
| def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None): | ||||
|     return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', | ||||
| def author_tf_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None): | ||||
|     return cosine_similarities(infile, | ||||
|                                'author', | ||||
|                                outfile, | ||||
|                                min_df, | ||||
|  | ||||
| @ -60,7 +60,7 @@ def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subre | ||||
|     if included_subreddits is None: | ||||
|         included_subreddits = select_topN_subreddits(topN) | ||||
|     else: | ||||
|         included_subreddits = set(open(included_subreddits)) | ||||
|         included_subreddits = set(map(str.strip,map(str.lower,open(included_subreddits)))) | ||||
| 
 | ||||
|     if exclude_phrases == True: | ||||
|         tfidf = tfidf.filter(~f.col(term_colname).contains("_")) | ||||
|  | ||||
| @ -11,7 +11,7 @@ def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_ | ||||
|     df = df.filter(~ f.col(term_colname).isin(exclude)) | ||||
| 
 | ||||
|     if included_subreddits is not None: | ||||
|         include_subs = list(open(included_subreddits)) | ||||
|         include_subs = set(map(str.strip,map(str.lower, open(included_subreddits)))) | ||||
|     else: | ||||
|         include_subs = select_topN_subreddits(topN) | ||||
| 
 | ||||
| @ -28,40 +28,44 @@ def tfidf_weekly(inpath, outpath, topN, term_colname, exclude, included_subreddi | ||||
|     return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits) | ||||
| 
 | ||||
| def tfidf_authors(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet', | ||||
|                   topN=25000): | ||||
|                   topN=25000, | ||||
|                   included_subreddits=None): | ||||
| 
 | ||||
|     return tfidf("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet", | ||||
|                  outpath, | ||||
|                  topN, | ||||
|                  'author', | ||||
|                  ['[deleted]','AutoModerator'], | ||||
|                  included_subreddits=None | ||||
|                  included_subreddits=included_subreddits | ||||
|                  ) | ||||
| 
 | ||||
| def tfidf_terms(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet', | ||||
|                 topN=25000): | ||||
|                 topN=25000, | ||||
|                 included_subreddits=None): | ||||
| 
 | ||||
|     return tfidf("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet", | ||||
|                  outpath, | ||||
|                  topN, | ||||
|                  'term', | ||||
|                  [], | ||||
|                  included_subreddits=None | ||||
|                  included_subreddits=included_subreddits | ||||
|                  ) | ||||
| 
 | ||||
| def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet', | ||||
|                          topN=25000): | ||||
|                          topN=25000, | ||||
|                          included_subreddits=None): | ||||
| 
 | ||||
|     return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet", | ||||
|                         outpath, | ||||
|                         topN, | ||||
|                         'author', | ||||
|                         ['[deleted]','AutoModerator'], | ||||
|                         included_subreddits=None | ||||
|                         included_subreddits=included_subreddits | ||||
|                         ) | ||||
| 
 | ||||
| def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', | ||||
|                        topN=25000): | ||||
|                        topN=25000, | ||||
|                        included_subreddits=None): | ||||
| 
 | ||||
| 
 | ||||
|     return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet", | ||||
| @ -69,7 +73,7 @@ def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf | ||||
|                         topN, | ||||
|                         'term', | ||||
|                         [], | ||||
|                         included_subreddits=None | ||||
|                         included_subreddits=included_subreddits | ||||
|                         ) | ||||
| 
 | ||||
| 
 | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user