1
0

update clustering scripts

This commit is contained in:
Nate E TeBlunthuis
2021-08-03 14:55:02 -07:00
parent 87ffaa6858
commit cf86c7492c
11 changed files with 73 additions and 27 deletions

View File

@@ -1,4 +1,4 @@
#!/usr/bin/bash
start_spark_cluster.sh
singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 lsi_similarities.py author --outfile=/gscratch/comdata/output//reddit_similarity/subreddit_comment_authors_10k_LSI.feather --topN=10000
singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname):7077 top_subreddits_by_comments.py
singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh

View File

@@ -17,7 +17,7 @@ df = df.filter(~df.subreddit.like("u_%"))
df = df.groupBy('subreddit').agg(f.count('id').alias("n_comments"))
df = df.join(prop_nsfw,on='subreddit')
df = df.filter(df.prop_nsfw < 0.5)
#df = df.filter(df.prop_nsfw < 0.5)
win = Window.orderBy(f.col('n_comments').desc())
df = df.withColumn('comments_rank', f.rank().over(win))
@@ -26,4 +26,4 @@ df = df.toPandas()
df = df.sort_values("n_comments")
df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv', index=False)
df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nsfw.csv', index=False)