update clustering scripts
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/bash
|
||||
start_spark_cluster.sh
|
||||
singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 lsi_similarities.py author --outfile=/gscratch/comdata/output//reddit_similarity/subreddit_comment_authors_10k_LSI.feather --topN=10000
|
||||
singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname):7077 top_subreddits_by_comments.py
|
||||
singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh
|
||||
|
||||
@@ -17,7 +17,7 @@ df = df.filter(~df.subreddit.like("u_%"))
|
||||
df = df.groupBy('subreddit').agg(f.count('id').alias("n_comments"))
|
||||
|
||||
df = df.join(prop_nsfw,on='subreddit')
|
||||
df = df.filter(df.prop_nsfw < 0.5)
|
||||
#df = df.filter(df.prop_nsfw < 0.5)
|
||||
|
||||
win = Window.orderBy(f.col('n_comments').desc())
|
||||
df = df.withColumn('comments_rank', f.rank().over(win))
|
||||
@@ -26,4 +26,4 @@ df = df.toPandas()
|
||||
|
||||
df = df.sort_values("n_comments")
|
||||
|
||||
df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv', index=False)
|
||||
df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nsfw.csv', index=False)
|
||||
|
||||
Reference in New Issue
Block a user