update clustering scripts
This commit is contained in:
@@ -8,3 +8,9 @@ all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscrat
|
||||
|
||||
/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
|
||||
start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather" --agg=pd.DataFrame.sum
|
||||
|
||||
/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather
|
||||
start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather" --agg=pd.DataFrame.sum
|
||||
|
||||
/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather
|
||||
start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather" --agg=pd.DataFrame.sum
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/bash
|
||||
start_spark_cluster.sh
|
||||
spark-submit --master spark://$(hostname):18899 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --outpath=/gscratch/comdata/output/reddit_density/comment_authors_10000.feather --agg=pd.DataFrame.sum
|
||||
stop-all.sh
|
||||
singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather --outpath=/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather --agg=pd.DataFrame.sum
|
||||
singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
import pandas as pd
|
||||
from pandas.core.groupby import DataFrameGroupBy as GroupBy
|
||||
from pathlib import Path
|
||||
import fire
|
||||
import numpy as np
|
||||
import sys
|
||||
sys.path.append("..")
|
||||
sys.path.append("../similarities")
|
||||
from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_interval
|
||||
from similarities.similarities_helper import reindex_tfidf
|
||||
|
||||
# this is the mean of the ratio of the overlap to the focal size.
|
||||
# mean shared membership per focal community member
|
||||
@@ -13,10 +14,12 @@ from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_i
|
||||
|
||||
def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
|
||||
df = pd.read_feather(inpath)
|
||||
df = df.drop('subreddit',1)
|
||||
df = df.drop('_subreddit',1)
|
||||
np.fill_diagonal(df.values,0)
|
||||
df = agg(df, 0).reset_index()
|
||||
df = df.rename({0:'overlap_density'},axis='columns')
|
||||
outpath = Path(outpath)
|
||||
outpath.parent.mkdir(parents=True, exist_ok = True)
|
||||
df.to_feather(outpath)
|
||||
return df
|
||||
|
||||
@@ -25,6 +28,8 @@ def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
|
||||
# exclude the diagonal
|
||||
df = df.loc[df.subreddit != df.variable]
|
||||
res = agg(df.groupby(['subreddit','week'])).reset_index()
|
||||
outpath = Path(outpath)
|
||||
outpath.parent.mkdir(parents=True, exist_ok = True)
|
||||
res.to_feather(outpath)
|
||||
return res
|
||||
|
||||
|
||||
Reference in New Issue
Block a user