Changes from hyak.

parent 140d1bdd17
commit 4dc949de5f
@@ -1,10 +1,32 @@
 #srun_cdsc='srun -p comdata-int -A comdata --time=300:00:00 --time-min=00:15:00 --mem=100G --ntasks=1 --cpus-per-task=28'
-all:/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather
+all:/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather /gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather
+#all:/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather /gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather /gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather /gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather
 
 /gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
 # $srun_cdsc python3
-	./clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.85 --damping=0.85
+	start_spark_and_run.sh 1 clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.85 --damping=0.85
 
 /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
 # $srun_cdsc python3
-	./clustering.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather ---max_iter=1000 --convergence_iter=15 --preference_quantile=0.9 --damping=0.5
+	start_spark_and_run.sh 1 clustering.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather ---max_iter=1000 --convergence_iter=15 --preference_quantile=0.9 --damping=0.5
 
+/gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
+# $srun_cdsc
+	start_spark_and_run.sh 1 clustering.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet /gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.5 --damping=0.85
+
+# it's pretty difficult to get a result that isn't one huge megacluster. A sign that it's bullcrap
+# /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather
+# ./clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.9 --damping=0.85
+
+/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
+
+	start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet --output=/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather
+
+# /gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather
+
+# python3 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather --output=/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather
+
+/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
+# $srun_cdsc python3
+	start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --output=/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather
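Note: the clustering.py flags above (--max_iter, --convergence_iter, --preference_quantile, --damping) correspond to scikit-learn's AffinityPropagation parameters. clustering.py itself is not part of this diff, so the following is only a plausible sketch of such a script; the feather layout (a square similarity matrix plus a 'subreddit' name column) and the Fire entry point are assumptions, not the committed code.

# Hypothetical sketch of a clustering.py driven by the flags above.
import fire
import numpy as np
import pandas as pd
from sklearn.cluster import AffinityPropagation

def cluster(infile, outfile, max_iter=400, convergence_iter=15,
            preference_quantile=0.85, damping=0.85):
    df = pd.read_feather(infile)
    mat = np.array(df.drop('subreddit', axis=1))  # assumed n x n similarities
    # the preference controls how many exemplars (clusters) emerge;
    # taking a quantile of the observed similarities is one common choice
    preference = np.quantile(mat, preference_quantile)
    clustering = AffinityPropagation(damping=damping,
                                     max_iter=max_iter,
                                     convergence_iter=convergence_iter,
                                     preference=preference,
                                     affinity='precomputed').fit(mat)
    out = pd.DataFrame({'subreddit': df.subreddit,
                        'cluster': clustering.labels_})
    out.to_feather(outfile)

if __name__ == "__main__":
    fire.Fire(cluster)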
@@ -5,7 +5,7 @@ from numpy import random
 import numpy as np
 from sklearn.manifold import TSNE
 
-similarities = "term_similarities_10000.feather"
+similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet"
 
 def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=10000, early_exaggeration=20):
     '''
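fit_tsne's body is outside this hunk; given the signature, it presumably loads the pairwise similarities, converts them to distances, and runs scikit-learn's TSNE with metric='precomputed'. A hedged sketch of that flow follows; the 1 - similarity distance transform and the column layout are assumptions.

# Sketch of a fit_tsne body consistent with the signature above.
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

def fit_tsne(similarities, output, learning_rate=750, perplexity=50,
             n_iter=10000, early_exaggeration=20):
    # the committed default path is a parquet file, so support both formats
    if similarities.endswith('.parquet'):
        df = pd.read_parquet(similarities)
    else:
        df = pd.read_feather(similarities)
    mat = np.array(df.drop('subreddit', axis=1))
    dist = 1 - mat                 # treat cosine similarity as dissimilarity
    np.fill_diagonal(dist, 0)
    coords = TSNE(n_components=2,
                  learning_rate=learning_rate,
                  perplexity=perplexity,
                  n_iter=n_iter,
                  early_exaggeration=early_exaggeration,
                  metric='precomputed',
                  init='random').fit_transform(dist)
    out = pd.DataFrame({'subreddit': df.subreddit,
                        'x': coords[:, 0],
                        'y': coords[:, 1]})
    out.to_feather(output)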
@@ -1,7 +1,10 @@
-all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscratch/comdata/output/reddit_density/comment_authors_10000.feather
+all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscratch/comdata/output/reddit_density/comment_authors_10000.feather /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather
 
 /gscratch/comdata/output/reddit_density/comment_terms_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
-	python3 overlap_density.py terms --inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather" --agg=pd.DataFrame.sum
+	start_spark_and_run.sh 1 overlap_density.py terms --inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather" --agg=pd.DataFrame.sum
 
 /gscratch/comdata/output/reddit_density/comment_authors_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
-	python3 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather" --agg=pd.DataFrame.sum
+	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather" --agg=pd.DataFrame.sum
+
+/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
+	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather" --agg=pd.DataFrame.sum
@@ -1,4 +1,4 @@
 #!/usr/bin/bash
 start_spark_cluster.sh
-spark-submit --master spark://$(hostname):18899 overlap_density.py wang_overlaps --inpath=/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet --to_date=2020-04-13
+spark-submit --master spark://$(hostname):18899 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --outpath=/gscratch/comdata/output/reddit_density/comment_authors_10000.feather --agg=pd.DataFrame.sum
 stop-all.sh
@@ -5,7 +5,7 @@ import numpy as np
 import sys
 sys.path.append("..")
 sys.path.append("../similarities")
-from similarities.similarities_helper import read_tfidf_matrix, reindex_tfidf, reindex_tfidf_time_interval
+from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_interval
 
 # this is the mean of the ratio of the overlap to the focal size.
 # mean shared membership per focal community member
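The two retained comments describe the density measure: for each focal subreddit, the mean of its overlap with every other subreddit. An illustrative numpy/pandas version of that statistic follows (the names are illustrative; the module's actual helpers are not shown in this diff).

# Illustrative overlap density: the row-mean of a similarity matrix,
# excluding the self-similarity on the diagonal.
import numpy as np
import pandas as pd

def overlap_density(sim_df):
    mat = np.array(sim_df.drop('subreddit', axis=1), dtype='double')
    np.fill_diagonal(mat, 0)                        # drop self-overlap
    density = mat.sum(axis=1) / (mat.shape[1] - 1)  # mean over the others
    return pd.DataFrame({'subreddit': sim_df.subreddit,
                         'overlap_density': density})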
@@ -72,5 +72,5 @@ if __name__ == "__main__":
     fire.Fire({'authors':author_overlap_density,
                'terms':term_overlap_density,
                'author_weekly':author_overlap_density_weekly,
-               'term_weekly':term_overlap_density_weekly,
-               'wang_overlaps':wang_overlap_density})
+               'term_weekly':term_overlap_density_weekly})
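This fire.Fire dict is what lets the Makefiles call `overlap_density.py terms ...` and `overlap_density.py authors ...`: the first CLI token selects a function from the dict, and the remaining --flags become keyword arguments. A tiny self-contained demo of the same dispatch pattern:

# Demo of fire's dict dispatch; run as: python demo.py greet --name=world
import fire

def greet(name, excited=False):
    return f"Hello {name}{'!' if excited else '.'}"

if __name__ == "__main__":
    fire.Fire({'greet': greet})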
@@ -1,13 +1,25 @@
-all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet
+all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms.parquet
 
-/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet
-	start_spark_and_run.sh 1 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.feather
-
-/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
-	start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.feather
-
-/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
-	start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.feather
-
-/gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
-	start_spark_and_run.sh 1 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10000_weely.parquet
+# all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet
+
+# /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
+# start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.feather
+
+/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv
+	start_spark_and_run.sh 1 tfidf.py terms --topN=10000
+
+/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv
+	start_spark_and_run.sh 1 tfidf.py authors --topN=10000
+
+/gscratch/comdata/output/reddit_similarity/comment_authors_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
+	start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
+
+/gscratch/comdata/output/reddit_similarity/comment_terms.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet
+	start_spark_and_run.sh 1 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
+
+# /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet
+# start_spark_and_run.sh 1 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10000_weely.parquet
+
+/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
+	start_spark_and_run.sh 1 cosine_similarities.py author-tf --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
Binary file not shown.
@@ -1,30 +1,53 @@
 import pandas as pd
 import fire
 from pathlib import Path
-from similarities_helper import similarities
+from similarities_helper import similarities, column_similarities
 
-def cosine_similarities(infile, term_colname, outfile, min_df=None, included_subreddits=None, topN=500, exclude_phrases=False,from_date=None, to_date=None):
-    return similiarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date)
-
-def term_cosine_similarities(outfile, min_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
+def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'):
+    return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
+
+def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
     return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet',
                                'term',
                                outfile,
                                min_df,
+                               max_df,
                                included_subreddits,
                                topN,
-                               exclude_phrases)
+                               exclude_phrases,
+                               from_date,
+                               to_date)
 
-def author_cosine_similarities(outfile, min_df=2, included_subreddits=None, topN=10000, from_date=None, to_date=None):
+def author_cosine_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
     return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
                                'author',
                                outfile,
                                min_df,
+                               max_df,
                                included_subreddits,
                                topN,
-                               exclude_phrases=False)
+                               exclude_phrases=False,
+                               from_date=from_date,
+                               to_date=to_date)
+
+def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
+    return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
+                               'author',
+                               outfile,
+                               min_df,
+                               max_df,
+                               included_subreddits,
+                               topN,
+                               exclude_phrases=False,
+                               from_date=from_date,
+                               to_date=to_date,
+                               tfidf_colname='relative_tf')
 
 if __name__ == "__main__":
     fire.Fire({'term':term_cosine_similarities,
-               'author':author_cosine_similarities})
+               'author':author_cosine_similarities,
+               'author-tf':author_tf_similarities})
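column_similarities is imported above but defined elsewhere. If it computes cosine similarity between the subreddit columns of the sparse tf-idf matrix, an implementation could be as small as an L2 column normalization followed by a self-product; this is an assumption, not the repo's code. Note also that the new author_tf_similarities passes tfidf_colname='relative_tf', so the same routine would run over relative term frequencies instead of tf-idf scores.

# Plausible column_similarities: cosine similarity between the columns
# (subreddits) of a sparse term-by-subreddit matrix.
from sklearn.preprocessing import normalize

def column_similarities(mat):
    normed = normalize(mat, norm='l2', axis=0)  # unit-length columns
    return normed.T @ normed                    # subreddit x subreddit cosines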
@@ -1,4 +1,4 @@
 #!/usr/bin/bash
 start_spark_cluster.sh
-spark-submit --master spark://$(hostname):18899 wang_similarity.py --infile=/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet --max_df=10 --outfile=/gscratch/comdata/output/reddit_similarity/wang_similarity_1000_max10.feather
+spark-submit --master spark://$(hostname):18899 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
 stop-all.sh
@@ -75,17 +75,20 @@ def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subre
     spark.stop()
     return (tempdir, subreddit_names)
 
-def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
+def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'):
+    '''
+    tfidf_colname: set to 'relative_tf' to use normalized term frequency instead of tf-idf, which can be useful for author-based similarities.
+    '''
     if from_date is not None or to_date is not None:
-        tempdir, subreddit_names = reindex_tfidf_time_interval(infile, term_colname='author', min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False, from_date=from_date, to_date=to_date)
+        tempdir, subreddit_names = reindex_tfidf_time_interval(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False, from_date=from_date, to_date=to_date)
     else:
-        tempdir, subreddit_names = reindex_tfidf(infile, term_colname='author', min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False)
+        tempdir, subreddit_names = reindex_tfidf(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False)
 
     print("loading matrix")
     # mat = read_tfidf_matrix("term_tfidf_entries7ejhvnvl.parquet", term_colname)
-    mat = read_tfidf_matrix(tempdir.name, term_colname)
+    mat = read_tfidf_matrix(tempdir.name, term_colname, tfidf_colname)
     print('computing similarities')
     sims = simfunc(mat)
     del mat
@@ -108,14 +111,24 @@ def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=Non
     sims.to_feather(outfile)
     tempdir.cleanup()
 
-def read_tfidf_matrix_weekly(path, term_colname, week):
+def read_tfidf_matrix_weekly(path, term_colname, week, tfidf_colname='tf_idf'):
     term = term_colname
     term_id = term + '_id'
     term_id_new = term + '_id_new'
 
     dataset = ds.dataset(path,format='parquet')
-    entries = dataset.to_table(columns=['tf_idf','subreddit_id_new',term_id_new],filter=ds.field('week')==week).to_pandas()
-    return(csr_matrix((entries.tf_idf,(entries[term_id_new]-1, entries.subreddit_id_new-1))))
+    entries = dataset.to_table(columns=[tfidf_colname,'subreddit_id_new', term_id_new],filter=ds.field('week')==week).to_pandas()
+    return(csr_matrix((entries[tfidf_colname], (entries[term_id_new]-1, entries.subreddit_id_new-1))))
+
+def read_tfidf_matrix(path, term_colname, tfidf_colname='tf_idf'):
+    term = term_colname
+    term_id = term + '_id'
+    term_id_new = term + '_id_new'
+    dataset = ds.dataset(path,format='parquet')
+    print(f"tfidf_colname:{tfidf_colname}")
+    entries = dataset.to_table(columns=[tfidf_colname, 'subreddit_id_new',term_id_new]).to_pandas()
+    return(csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1))))
 
 def write_weekly_similarities(path, sims, week, names):
     sims['week'] = week
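Both readers rebuild the sparse matrix with scipy's coordinate constructor, csr_matrix((data, (row, col))); the -1 shifts convert the 1-based *_id_new indices written upstream into 0-based matrix coordinates. A toy example of that constructor:

# csr_matrix((data, (row, col))) builds a sparse matrix by coordinates;
# here, three tf-idf entries placed at (term_id-1, subreddit_id-1).
import numpy as np
from scipy.sparse import csr_matrix

data = np.array([0.5, 1.2, 0.3])
term_id_new = np.array([1, 2, 3])       # 1-based, as in the parquet files
subreddit_id_new = np.array([1, 1, 2])  # 1-based
mat = csr_matrix((data, (term_id_new - 1, subreddit_id_new - 1)))
print(mat.shape)      # (3, 2): terms x subreddits
print(mat.toarray())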
@@ -127,15 +140,6 @@ def write_weekly_similarities(path, sims, week, names):
     sims = sims.melt(id_vars=['subreddit','week'],value_vars=names.subreddit.values)
     sims.to_parquet(p / week.isoformat())
 
-def read_tfidf_matrix(path,term_colname):
-    term = term_colname
-    term_id = term + '_id'
-    term_id_new = term + '_id_new'
-
-    dataset = ds.dataset(path,format='parquet')
-    entries = dataset.to_table(columns=['tf_idf','subreddit_id_new',term_id_new]).to_pandas()
-    return(csr_matrix((entries.tf_idf,(entries[term_id_new]-1, entries.subreddit_id_new-1))))
-
 def column_overlaps(mat):
     non_zeros = (mat != 0).astype('double')
@@ -383,7 +387,7 @@ def build_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm
     return df
 
-def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv"):
+def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonswf.csv"):
     rankdf = pd.read_csv(path)
     included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values)
     return included_subreddits
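The hunk header mentions tf_family=tf_weight.Norm05, which is not defined anywhere in this diff. Judging only by the name, Norm05 plausibly denotes the standard augmented term frequency, 0.5 + 0.5 * tf / max_tf, combined with an idf term; a sketch under that assumption:

# Assumed meaning of tf_weight.Norm05, inferred from its name only:
# augmented term frequency (0.5 + 0.5 * tf / max_tf) times idf.
import numpy as np

def norm05_tfidf(tf, max_tf, df, n_docs):
    return (0.5 + 0.5 * tf / max_tf) * np.log(n_docs / (1 + df))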
@@ -58,12 +58,13 @@ def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfi
 def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
                        topN=25000):
 
+
     return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
                         outpath,
                         topN,
                         'term',
                         []
                         )
 
 if __name__ == "__main__":
@@ -12,7 +12,7 @@ infile="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
 def wang_overlaps(infile, outfile="/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather", min_df=1, max_df=None, included_subreddits=None, topN=10000, exclude_phrases=False, from_date=None, to_date=None):
 
-    return similarities(infile=infile, simfunc=wang_similarity, term_colname='author', outfile=outfile, min_df=min_df, max_df=None, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases, from_date=from_date, to_date=to_date)
+    return similarities(infile=infile, simfunc=wang_similarity, term_colname='author', outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases, from_date=from_date, to_date=to_date)
 
 if __name__ == "__main__":
     fire.Fire(wang_overlaps)
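wang_similarity itself is defined outside this hunk. Since it is applied to an author-by-subreddit matrix, and column_overlaps elsewhere in similarities_helper.py starts from the same (mat != 0) binarization, a plausible reading is a shared-author count between subreddit columns; this sketch is that assumption, not the repo's definition:

# Assumed shape of wang_similarity: counts of co-occurring authors
# between every pair of subreddit columns of a sparse matrix.
def wang_similarity(mat):
    non_zeros = (mat != 0).astype('double')  # did the author participate?
    return non_zeros.T @ non_zeros           # shared-author counts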
File diff suppressed because one or more lines are too long
@@ -14,7 +14,7 @@ def base_plot(plot_data):
 
     cluster_dropdown = alt.binding_select(options=[str(c) for c in sorted(set(plot_data.cluster))])
 
-    subreddit_dropdown = alt.binding_select(options=sorted(plot_data.subreddit))
+    # subreddit_dropdown = alt.binding_select(options=sorted(plot_data.subreddit))
 
     cluster_click_select = alt.selection_single(on='click',fields=['cluster'], bind=cluster_dropdown, name=' ')
     # cluster_select = alt.selection_single(fields=['cluster'], bind=cluster_dropdown, name='cluster')
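The dropdown-plus-click selection kept in this hunk is the standard Altair 4 pattern of a binding_select bound to a selection_single. A self-contained miniature of the same interaction, with toy data rather than the repo's plot_data:

# Minimal Altair 4 example of a dropdown/click-bound selection like
# cluster_click_select above; the data here is illustrative.
import altair as alt
import pandas as pd

plot_data = pd.DataFrame({'x': [1, 2, 3, 4],
                          'y': [3, 1, 4, 2],
                          'cluster': ['0', '0', '1', '1']})

cluster_dropdown = alt.binding_select(options=sorted(set(plot_data.cluster)))
cluster_click_select = alt.selection_single(on='click', fields=['cluster'],
                                            bind=cluster_dropdown, name=' ')

chart = alt.Chart(plot_data).mark_point().encode(
    x='x', y='y',
    color=alt.condition(cluster_click_select, 'cluster:N', alt.value('lightgray'))
).add_selection(cluster_click_select)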
@@ -42,7 +42,7 @@ def zoom_plot(plot_data):
     chart = base_plot(plot_data)
 
     chart = chart.interactive()
-    chart = chart.properties(width=1275,height=1000)
+    chart = chart.properties(width=1275,height=800)
 
     return chart
@@ -139,11 +139,19 @@ def assign_cluster_colors(tsne_data, clusters, n_colors, n_neighbors = 4):
 
 def build_visualization(tsne_data, clusters, output):
 
+    # tsne_data = "/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather"
+    # clusters = "/gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather"
+
     tsne_data = pd.read_feather(tsne_data)
     clusters = pd.read_feather(clusters)
 
     tsne_data = assign_cluster_colors(tsne_data,clusters,10,8)
 
+    # sr_per_cluster = tsne_data.groupby('cluster').subreddit.count().reset_index()
+    # sr_per_cluster = sr_per_cluster.rename(columns={'subreddit':'cluster_size'})
+
+    tsne_data = tsne_data.merge(sr_per_cluster,on='cluster')
 
     term_zoom_plot = zoom_plot(tsne_data)
 
     term_zoom_plot.save(output)
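As committed, the new merge references sr_per_cluster while the lines defining it remain commented out, so build_visualization would raise a NameError at that point. A working version of the step, following the commented code itself:

# The cluster-size annotation the commented lines describe, uncommented
# so the merge below it has its input defined.
sr_per_cluster = tsne_data.groupby('cluster').subreddit.count().reset_index()
sr_per_cluster = sr_per_cluster.rename(columns={'subreddit': 'cluster_size'})
tsne_data = tsne_data.merge(sr_per_cluster, on='cluster')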