Merge branch 'excise_reindex' of code:cdsc_reddit into excise_reindex
This commit is contained in:
@@ -4,44 +4,49 @@ from pathlib import Path
|
||||
from similarities_helper import similarities, column_similarities
|
||||
from functools import partial
|
||||
|
||||
def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'):
|
||||
def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'):
|
||||
|
||||
return similarities(inpath=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
|
||||
return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
|
||||
|
||||
# change so that these take in an input as an optional argument (for speed, but also for idf).
|
||||
def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
|
||||
|
||||
return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet',
|
||||
def term_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
|
||||
|
||||
return cosine_similarities(infile,
|
||||
'term',
|
||||
outfile,
|
||||
min_df,
|
||||
max_df,
|
||||
included_subreddits,
|
||||
topN,
|
||||
exclude_phrases,
|
||||
from_date,
|
||||
to_date
|
||||
)
|
||||
|
||||
def author_cosine_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
|
||||
return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',
|
||||
def author_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
|
||||
return cosine_similarities(infile,
|
||||
'author',
|
||||
outfile,
|
||||
min_df,
|
||||
max_df,
|
||||
included_subreddits,
|
||||
topN,
|
||||
exclude_phrases=False,
|
||||
from_date=from_date,
|
||||
to_date=to_date
|
||||
)
|
||||
|
||||
def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
|
||||
return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',
|
||||
def author_tf_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
|
||||
return cosine_similarities(infile,
|
||||
'author',
|
||||
outfile,
|
||||
min_df,
|
||||
max_df,
|
||||
included_subreddits,
|
||||
topN,
|
||||
exclude_phrases=False,
|
||||
from_date=from_date,
|
||||
to_date=to_date,
|
||||
tfidf_colname='relative_tf'
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/bash
|
||||
start_spark_cluster.sh
|
||||
singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 lsi_similarities.py author --outfile=/gscratch/comdata/output//reddit_similarity/subreddit_comment_authors_10k_LSI.feather --topN=10000
|
||||
singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname):7077 top_subreddits_by_comments.py
|
||||
singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh
|
||||
|
||||
@@ -97,6 +97,7 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu
|
||||
'relative_tf':ds.field('relative_tf').cast('float32'),
|
||||
'tf_idf':ds.field('tf_idf').cast('float32')}
|
||||
|
||||
|
||||
df = tfidf_ds.to_table(filter=ds_filter,columns=projection)
|
||||
|
||||
df = df.to_pandas(split_blocks=True,self_destruct=True)
|
||||
@@ -124,6 +125,17 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu
|
||||
|
||||
return (df, tfidf_ds, ds_filter)
|
||||
|
||||
with Pool(cpu_count()) as pool:
|
||||
chunks = pool.imap_unordered(pull_names,batches)
|
||||
subreddit_names = pd.concat(chunks,copy=False).drop_duplicates()
|
||||
|
||||
subreddit_names = subreddit_names.set_index("subreddit_id")
|
||||
new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates()
|
||||
new_ids = new_ids.set_index('subreddit_id')
|
||||
subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index()
|
||||
subreddit_names = subreddit_names.drop("subreddit_id",1)
|
||||
subreddit_names = subreddit_names.sort_values("subreddit_id_new")
|
||||
return(df, subreddit_names)
|
||||
|
||||
def pull_names(batch):
    """Convert one batch to a pandas frame and drop its duplicate rows.

    Kept as a plain module-level function because it is handed to
    ``pool.imap_unordered`` as the per-batch worker. De-duplication here is
    only per batch; the caller concatenates the chunks and de-duplicates
    again across batches.

    Args:
        batch: Object exposing ``to_pandas()``. Presumably a pyarrow record
            batch of subreddit id/name rows -- TODO confirm against the code
            that builds ``batches``.

    Returns:
        The frame produced by ``batch.to_pandas()`` with exact duplicate
        rows removed.
    """
    return batch.to_pandas().drop_duplicates()
|
||||
@@ -165,7 +177,6 @@ def similarities(inpath, simfunc, term_colname, outfile, min_df=None, max_df=Non
|
||||
|
||||
print(f'computing similarities on mat. mat.shape:{mat.shape}')
|
||||
print(f"size of mat is:{mat.data.nbytes}",flush=True)
|
||||
# transform this to debug term tfidf
|
||||
sims = simfunc(mat)
|
||||
del mat
|
||||
|
||||
@@ -256,13 +267,12 @@ def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=196
|
||||
yield (sims, n_dims)
|
||||
else:
|
||||
return sims
|
||||
|
||||
|
||||
def column_similarities(mat):
    """Return the pairwise cosine-similarity matrix for ``mat``.

    Cosine similarity is computed as ``1 - cosine distance`` using
    ``pairwise_distances``.

    NOTE(review): scikit-learn's ``pairwise_distances`` compares the *rows*
    of its input, so despite the name this presumably expects the caller to
    pass a matrix whose rows are the items to compare -- confirm against
    the call site.

    Args:
        mat: Matrix accepted by ``pairwise_distances`` (the caller reads
            ``mat.shape`` and ``mat.data.nbytes``, so it looks like a scipy
            sparse matrix -- TODO confirm).

    Returns:
        A square array of similarities, one row and column per row of
        ``mat``.
    """
    return 1 - pairwise_distances(mat, metric='cosine')
|
||||
|
||||
# need to rewrite this so that subreddit ids and term ids are fixed over the whole thing.
|
||||
# this affords taking the LSI similarities.
|
||||
# fill all 0s if we don't have it.
|
||||
|
||||
def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05):
|
||||
term = term_colname
|
||||
term_id = term + '_id'
|
||||
@@ -295,7 +305,6 @@ def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weig
|
||||
subreddits = df.select(['subreddit']).distinct()
|
||||
subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.orderBy("subreddit")))
|
||||
|
||||
# df = df.cache()
|
||||
df = df.join(subreddits,on=['subreddit'])
|
||||
|
||||
# map terms to indexes in the tfs and the idfs
|
||||
|
||||
@@ -52,7 +52,7 @@ def tfidf_terms(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/commen
|
||||
|
||||
def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet',
|
||||
topN=None,
|
||||
include_subreddits=None):
|
||||
included_subreddits=None):
|
||||
|
||||
return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
|
||||
outpath,
|
||||
@@ -63,7 +63,8 @@ def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfi
|
||||
)
|
||||
|
||||
def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
|
||||
topN=25000):
|
||||
topN=None,
|
||||
included_subreddits=None):
|
||||
|
||||
|
||||
return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
|
||||
@@ -71,7 +72,7 @@ def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf
|
||||
topN,
|
||||
'term',
|
||||
[],
|
||||
included_subreddits=None
|
||||
included_subreddits=included_subreddits
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ df = df.filter(~df.subreddit.like("u_%"))
|
||||
df = df.groupBy('subreddit').agg(f.count('id').alias("n_comments"))
|
||||
|
||||
df = df.join(prop_nsfw,on='subreddit')
|
||||
df = df.filter(df.prop_nsfw < 0.5)
|
||||
#df = df.filter(df.prop_nsfw < 0.5)
|
||||
|
||||
win = Window.orderBy(f.col('n_comments').desc())
|
||||
df = df.withColumn('comments_rank', f.rank().over(win))
|
||||
@@ -26,4 +26,4 @@ df = df.toPandas()
|
||||
|
||||
df = df.sort_values("n_comments")
|
||||
|
||||
df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv', index=False)
|
||||
df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nsfw.csv', index=False)
|
||||
|
||||
Reference in New Issue
Block a user