18
0

34 Commits

Author SHA1 Message Date
9345f9de94 make pass keyword arg to dataframe.drop 2023-05-31 09:47:21 -07:00
07b0dff9bc changes for archiving. 2023-05-23 17:18:19 -07:00
811a0d87c4 changes from dirty branch. 2023-05-18 10:29:08 -07:00
c190791364 add 2 more umap parameters 2022-06-08 17:27:37 -07:00
5a40465a62 add support for umap->hdbscan clustering method 2022-06-08 17:01:27 -07:00
55b75ea6fc Merge remote-tracking branch 'refs/remotes/origin/excise_reindex' into excise_reindex 2022-04-06 11:14:13 -07:00
197518a222 git-annex in 2022-04-06 11:11:11 -07:00
65deba5e4e Merge branch 'excise_reindex' of code:cdsc_reddit into excise_reindex 2022-01-19 14:01:44 -08:00
7b130a30af commit changes from smap project. 2022-01-19 13:57:02 -08:00
98c1317af5 update pushshift dumps. 2021-12-10 21:23:32 -08:00
541e125b28 lsi support for weekly similarities 2021-08-11 22:48:33 -07:00
b7c39a3494 Merge branch 'master' of code:cdsc_reddit into excise_reindex 2021-08-03 15:13:39 -07:00
ce549c6c97 Merge branch 'excise_reindex' of code:cdsc_reddit into excise_reindex 2021-08-03 15:13:21 -07:00
6e43294a41 Updates to similarities code for smap project. 2021-08-03 15:06:48 -07:00
14ab979f59 Merge branch 'master' of code:cdsc_reddit 2021-08-03 15:03:40 -07:00
2d21ff1137 Merge branch 'master' of code:cdsc_reddit into excise_reindex 2021-08-03 15:02:08 -07:00
Nate E TeBlunthuis
cf86c7492c update clustering scripts 2021-08-03 14:55:02 -07:00
Nate E TeBlunthuis
c6122bb429 Merge branch 'master' of code:cdsc_reddit 2021-07-28 15:32:21 -07:00
Nate E TeBlunthuis
596e1ff339 no longer do we need to get daily dumps 2021-07-28 15:32:04 -07:00
Nate E TeBlunthuis
87ffaa6858 script for picking the best clustering given constraints 2021-05-14 19:10:36 -07:00
Nate E TeBlunthuis
7b14db67de Merge branch 'excise_reindex' of code:cdsc_reddit into excise_reindex 2021-05-13 22:28:31 -07:00
Nate E TeBlunthuis
0b95bea30e support isolates in visualization 2021-05-13 22:26:58 -07:00
Nate E TeBlunthuis
582cf263ea bug fix in affinity clustering 2021-05-13 22:26:15 -07:00
Nate E TeBlunthuis
8a2248fae1 Merge remote-tracking branch 'origin/excise_reindex' into temp 2021-05-10 18:32:03 -07:00
Nate E TeBlunthuis
47ba04aa97 add script for pulling cluster timeseries 2021-05-10 18:24:22 -07:00
Nate E TeBlunthuis
4cb7eeec80 Refactor to make a decent api. 2021-05-10 13:46:49 -07:00
Nate E TeBlunthuis
f05cb962e0 refactor clustring in object oriented style 2021-05-07 22:33:26 -07:00
Nate E TeBlunthuis
8d1df5b26e refactor clustering.py into method-specific files. 2021-05-03 11:28:48 -07:00
Nate E TeBlunthuis
e1c9d9af6f Remove 'exclude phrases' parameter. 2021-05-03 10:37:09 -07:00
Nate E TeBlunthuis
6a3bfa26ee bugfix 2021-04-26 22:31:05 -07:00
Nate E TeBlunthuis
3a758f1fc8 Merge branch 'charliepatch' of code:cdsc_reddit into charliepatch 2021-04-26 13:58:25 -07:00
Nate E TeBlunthuis
806cfc948f support passing in list of tfidf vectors.
Also lowercases included subreddits.
2021-04-26 13:20:43 -07:00
Nate E TeBlunthuis
0fe120e4ab support passing in list of tfidf vectors.
Also lowercases included subreddits.
2021-04-26 11:44:56 -07:00
Nate E TeBlunthuis
a013f6718b export timeseries functions 2021-03-24 17:18:30 -07:00
75 changed files with 1655 additions and 1583 deletions

2
__init__.py Normal file
View File

@@ -0,0 +1,2 @@
from .timeseries import load_clusters, load_densities, build_cluster_timeseries

View File

@@ -1,74 +0,0 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import FloatType
import zlib
def zlib_entropy_rate(s):
    """Estimate the entropy rate of a string as its zlib compression ratio.

    Args:
        s: the text to measure (``None`` is tolerated).

    Returns:
        Compressed size (zlib level 6) divided by the encoded size in bytes,
        or ``None`` when there is nothing to compress (``None`` or a string
        that encodes to zero bytes), since the ratio is undefined there.
    """
    if s is None:
        return None
    # Encode once and reuse the bytes for both the compression and the length.
    sb = s.encode()
    if not sb:
        return None
    return len(zlib.compress(sb, level=6)) / len(sb)
# Spark UDF wrapper around the compression-ratio helper; yields a float column.
zlib_entropy_rate_udf = f.udf(zlib_entropy_rate,FloatType())

spark = SparkSession.builder.getOrCreate()

df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_author.parquet",compression='snappy')

# Flag every comment whose lower-cased body contains "bot".
df = df.withColumn("saidbot",f.lower(f.col("body")).like("%bot%"))

# df = df.filter(df.subreddit=='seattle')
# df = df.cache()

# Replies that say "good bot" or "bad bot".  The alternation has to be a group:
# the former character class "[good|bad]" matched any single one of the
# characters g, o, d, |, b, a in front of " bot" (so "a bot" also counted).
botreplies = df.filter(f.lower(df.body).rlike(".*(good|bad) bot.*"))
# substr(4,100) drops the first three characters of parent_id
# (presumably the "t1_"-style type prefix -- TODO confirm).
botreplies = botreplies.select([f.col("parent_id").substr(4,100).alias("bot_comment_id"),f.lower(f.col("body")).alias("good_bad_bot"),f.col("link_id").alias("gbbb_link_id")])

# Per replied-to comment: total votes plus the good/bad split.
botreplies = botreplies.groupby(['bot_comment_id']).agg(f.count('good_bad_bot').alias("N_goodbad_votes"),
                                                        f.sum((f.lower(f.col('good_bad_bot')).like('%good bot%').astype("double"))).alias("n_good_votes"),
                                                        f.sum((f.lower(f.col('good_bad_bot')).like('%bad bot%').astype("double"))).alias("n_bad_votes"))

# Per author: comment count and how often they say "bot".
comments_by_author = df.select(['author','id','saidbot']).groupBy('author').agg(f.count('id').alias("N_comments"),
                                                                                 f.mean(f.col('saidbot').astype("double")).alias("prop_saidbot"),
                                                                                 f.sum(f.col('saidbot').astype("double")).alias("n_saidbot"))

# pd_comments_by_author = comments_by_author.toPandas()
# pd_comments_by_author['frac'] = 500 / pd_comments_by_author['N_comments']
# pd_comments_by_author.loc[pd_comments_by_author.frac > 1, 'frac'] = 1
# fractions = pd_comments_by_author.loc[:,['author','frac']]
# fractions = fractions.set_index('author').to_dict()['frac']
# sampled_author_comments = df.sampleBy("author",fractions).groupBy('author').agg(f.concat_ws(" ", f.collect_list('body')).alias('comments'))

# Sample up to 1000 comments per author by ranking on a seeded random column.
df = df.withColumn("randn",f.randn(seed=1968))
win = Window.partitionBy("author").orderBy("randn")
df = df.withColumn("randRank",f.rank().over(win))
sampled_author_comments = df.filter(f.col("randRank") <= 1000)
sampled_author_comments = sampled_author_comments.groupBy('author').agg(f.concat_ws(" ", f.collect_list('body')).alias('comments'))

# Compression ratio of each author's concatenated sample.
author_entropy_rates = sampled_author_comments.select(['author',zlib_entropy_rate_udf(f.col('comments')).alias("entropy_rate")])

# Comments that received a good/bad-bot reply, joined to their vote counts.
parents = df.join(botreplies, on=df.id==botreplies.bot_comment_id,how='right_outer')

# Earliest such comment per author, then count that author's comments since.
win1 = Window.partitionBy("author")
parents = parents.withColumn("first_bot_reply",f.min(f.col("CreatedAt")).over(win1))
first_bot_reply = parents.filter(f.col("first_bot_reply")==f.col("CreatedAt"))
first_bot_reply = first_bot_reply.withColumnRenamed("CreatedAt","FB_CreatedAt")
first_bot_reply = first_bot_reply.withColumnRenamed("id","FB_id")
comments_since_first_bot_reply = df.join(first_bot_reply,on = 'author',how='right_outer').filter(f.col("CreatedAt")>=f.col("first_bot_reply"))
comments_since_first_bot_reply = comments_since_first_bot_reply.groupBy("author").agg(f.count("id").alias("N_comments_since_firstbot"))

# Aggregate votes per author and attach the per-author statistics.
bots = parents.groupby(['author']).agg(f.sum('N_goodbad_votes').alias("N_goodbad_votes"),
                                       f.sum(f.col('n_good_votes')).alias("n_good_votes"),
                                       f.sum(f.col('n_bad_votes')).alias("n_bad_votes"),
                                       f.count(f.col('author')).alias("N_bot_posts"))

bots = bots.join(comments_by_author,on="author",how='left_outer')
bots = bots.join(comments_since_first_bot_reply,on="author",how='left_outer')
bots = bots.join(author_entropy_rates,on='author',how='left_outer')

# Write a single output partition, most-voted authors first.
bots = bots.orderBy("N_goodbad_votes",ascending=False)
bots = bots.repartition(1)
bots.write.parquet("/gscratch/comdata/output/reddit_good_bad_bot.parquet",mode='overwrite')

View File

@@ -1,76 +1,36 @@
#srun_cdsc='srun -p comdata-int -A comdata --time=300:00:00 --time-min=00:15:00 --mem=100G --ntasks=1 --cpus-per-task=28' srun_singularity=srun -p compute-bigmem -A comdata --time=48:00:00 --mem=362G -c 40 /bin/bash -c
srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh similarity_data=../../data/reddit_similarity
similarity_data=/gscratch/comdata/output/reddit_similarity clustering_data=../../data/reddit_clustering
clustering_data=/gscratch/comdata/output/reddit_clustering kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]
kmeans_selection_grid="--max_iter=3000 --n_init=[10] --n_clusters=[100,500,1000,1500,2000,2500,3000,2350,3500,3570,4000]" hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf]
#selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]" affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]
all:$(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv
# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS
# $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS
$(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py authors_tf_10k_input_lsi=$(similarity_data)/subreddit_comment_authors-tf_10k_LSI
$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/kmeans $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(kmeans_selection_grid) authors_tf_10k_output_lsi=$(clustering_data)/subreddit_comment_authors-tf_10k_LSI
$(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py all:authors_tf_10k_lsi
$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/kmeans $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(kmeans_selection_grid)
$(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather authors_tf_10k_lsi:${authors_tf_10k_output_lsi}/kmeans/selection_data.csv ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(kmeans_selection_grid)
## LSI Models
${authors_tf_10k_output_lsi}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py kmeans_clustering.py
$(srun_singularity) -c "source ~/.bashrc; python3 kmeans_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/kmeans --savefile=${authors_tf_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)"
affinity_selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]" ${authors_tf_10k_output_lsi}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py affinity_clustering.py
$(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py $(srun_singularity) -c "source ~/.bashrc; python3 affinity_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/affinity --savefile=${authors_tf_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)"
$(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/affinity $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20
$(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py
$(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/affinity $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20 $(srun_singularity) -c "source ~/.bashrc; python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)"
$(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather ${authors_tf_10k_output_lsi}/best_hdbscan.feather:${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
$(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/affinity $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20 $(srun_singularity) -c "source ~/.bashrc; python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2"
${authors_tf_10k_input_lsi}:
$(MAKE) -C ../similarities
clean: clean:
rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
rm -f $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv
rm -f $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv
rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv
rm -f $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv
rm -f $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv
PHONY: clean PHONY: clean
# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py
# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS
# $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_terms_30k.feather clustering.py
# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_30k.feather $(clustering_data)/subreddit_comment_terms_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS
# $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS:clustering.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather
# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather $(clustering_data)/subreddit_comment_authors-tf_30k $(selection_grid) -J 8 && touch $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS
# $(clustering_data)/subreddit_comment_authors_100k.feather:clustering.py $(similarity_data)/subreddit_comment_authors_100k.feather
# $(srun_singularity) python3 clustering.py $(similarity_data)/subreddit_comment_authors_100k.feather $(clustering_data)/subreddit_comment_authors_100k.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.85 --damping=0.85
# $(clustering_data)/comment_terms_100k.feather:clustering.py $(similarity_data)/subreddit_comment_terms_100k.feather
# $(srun_singularity) python3 clustering.py $(similarity_data)/comment_terms_10000.feather $(clustering_data)/comment_terms_10000.feather ---max_iter=1000 --convergence_iter=15 --preference_quantile=0.9 --damping=0.5
# $(clustering_data)/subreddit_comment_author-tf_100k.feather:clustering.py $(similarity_data)/subreddit_comment_author-tf_100k.feather
# $(srun_singularity) python3 clustering.py $(similarity_data)/subreddit_comment_author-tf_100k.parquet $(clustering_data)/subreddit_comment_author-tf_100k.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.5 --damping=0.85
# it's pretty difficult to get a result that isn't one huge megacluster. A sign that it's bullcrap
# /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather
# ./clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.9 --damping=0.85
# /gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
# start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet --output=/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather
# /gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather
# python3 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather --output=/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather
# /gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
# # $srun_cdsc python3
# start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --output=/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather

View File

@@ -0,0 +1,129 @@
from sklearn.cluster import AffinityPropagation
from dataclasses import dataclass
from clustering_base import clustering_result, clustering_job
from grid_sweep import grid_sweep
from pathlib import Path
from itertools import product, starmap
import fire
import sys
import numpy as np
# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.
@dataclass
class affinity_clustering_result(clustering_result):
    """Result record for one affinity propagation fit.

    Extends ``clustering_result`` with the affinity-specific settings that
    produced the clustering.
    """

    damping: float
    convergence_iter: int
    preference_quantile: float
    preference: float
    max_iter: int
class affinity_job(clustering_job):
    """One affinity propagation clustering run, driven by ``clustering_job``.

    Args:
        infile: path of the input matrix file, passed to ``clustering_job``.
        outpath: directory for outputs, passed to ``clustering_job``.
        name: name of this run, passed to ``clustering_job``.
        damping: forwarded to ``AffinityPropagation``.
        max_iter: forwarded to ``AffinityPropagation``.
        convergence_iter: forwarded to ``AffinityPropagation``.
        preference_quantile: quantile of the matrix used as the ``preference``.
        random_state: forwarded to ``AffinityPropagation``.
        verbose: forwarded to ``AffinityPropagation``.
    """

    def __init__(self, infile, outpath, name, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True):
        super().__init__(infile,
                         outpath,
                         name,
                         call=self._affinity_clustering,
                         preference_quantile=preference_quantile,
                         damping=damping,
                         max_iter=max_iter,
                         convergence_iter=convergence_iter,
                         # Honour the caller's seed; it used to be pinned to
                         # 1968 regardless of the argument.
                         random_state=random_state,
                         verbose=verbose)
        self.damping=damping
        self.max_iter=max_iter
        self.convergence_iter=convergence_iter
        self.preference_quantile=preference_quantile

    def _affinity_clustering(self, mat, preference_quantile, *args, **kwargs):
        """Fit ``AffinityPropagation`` on ``1 - mat`` and return the fitted model.

        The preference is the ``preference_quantile`` quantile of the inverted
        matrix; it is also stored on ``self.preference`` for ``get_info``.
        """
        # NOTE(review): presumably the base class hands over distances
        # (1 - similarity), so this flips them back to similarities -- confirm.
        mat = 1-mat
        preference = np.quantile(mat, preference_quantile)
        self.preference = preference

        print(f"preference is {preference}")
        print("data loaded")
        sys.stdout.flush()
        clustering = AffinityPropagation(*args,
                                         preference=preference,
                                         affinity='precomputed',
                                         copy=False,
                                         **kwargs).fit(mat)
        return clustering

    def get_info(self):
        """Return the base result extended with the affinity settings."""
        result = super().get_info()
        self.result=affinity_clustering_result(**result.__dict__,
                                               damping=self.damping,
                                               max_iter=self.max_iter,
                                               convergence_iter=self.convergence_iter,
                                               preference_quantile=self.preference_quantile,
                                               preference=self.preference)

        return self.result
class affinity_grid_sweep(grid_sweep):
    """Grid sweep that builds one ``affinity_job`` per parameter combination.

    Args:
        inpath: input path handed to every job.
        outpath: output directory handed to every job.
        *args: one iterable per swept parameter, in the order
            dampings, max_iters, convergence_iters, preference_quantiles
            (the order ``namer`` and ``affinity_job`` expect).
        **kwargs: forwarded unchanged to ``grid_sweep.__init__``.
    """

    def __init__(self,
                 inpath,
                 outpath,
                 *args,
                 **kwargs):

        # The job type is followed directly by the paths and the namer; the
        # stray, undefined ``_afffinity_grid_sweep`` argument raised NameError.
        super().__init__(affinity_job,
                         inpath,
                         outpath,
                         self.namer,
                         *args,
                         **kwargs)

    def namer(self,
              damping,
              max_iter,
              convergence_iter,
              preference_quantile):
        """Return the run name encoding one parameter combination."""
        return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}"
def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5],n_cores=10):
    """Run affinity clustering once or more with different parameters.

    Usage:
    affinity_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters=<csv> --dampings=<csv> --preference_quantiles=<csv>

    Keyword arguments:
    savefile: path to save the metadata and diagnostics
    inpath: path to feather data containing a labeled matrix of subreddit similarities.
    outpath: path to output fit affinity propagation clusterings.
    dampings: one or more numbers in [0.5, 1). damping parameter in affinity propagation clustering.
    preference_quantiles: one or more numbers in (0,1) for selecting the 'preference' parameter.
    convergence_iters: one or more integers of number of iterations without improvement before stopping.
    max_iters: one or more numbers of different maximum iterations.
    n_cores: number of cores passed to the sweep's run().
    """
    def as_list(value):
        # A single command-line value (e.g. --dampings=0.8) reaches us as a
        # bare number rather than a list; wrap it so it can be iterated.
        if isinstance(value, (str, bytes)) or not hasattr(value, "__iter__"):
            return [value]
        return list(value)

    obj = affinity_grid_sweep(inpath,
                              outpath,
                              [float(d) for d in as_list(dampings)],
                              [int(m) for m in as_list(max_iters)],
                              [int(c) for c in as_list(convergence_iters)],
                              [float(q) for q in as_list(preference_quantiles)])
    obj.run(n_cores)
    obj.save(savefile)
def test_select_affinity_clustering():
    """Smoke-run a small LSI affinity sweep and save its summary table.

    Reads the similarity data from a fixed cluster path and writes into the
    local ``test_affinity`` directory, so it only works where that data exists.
    """
    # Imported here rather than at module level: affinity_clustering_lsi
    # imports this module, so a top-level import would be circular.
    from affinity_clustering_lsi import affinity_lsi_grid_sweep

    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
    outpath = "test_affinity"
    dampings=[0.8,0.9]
    max_iters=[100000]
    convergence_iters=[15]
    preference_quantiles=[0.5,0.7]

    gs = affinity_lsi_grid_sweep(inpath, 'all', outpath, dampings, max_iters, convergence_iters, preference_quantiles)
    gs.run(20)
    gs.save("test_affinity/lsi_sweep.csv")
if __name__ == "__main__":
fire.Fire(run_affinity_grid_sweep)

View File

@@ -0,0 +1,99 @@
import fire
from affinity_clustering import affinity_clustering_result, affinity_job, affinity_grid_sweep
from grid_sweep import grid_sweep
from lsi_base import lsi_result_mixin, lsi_grid_sweep, lsi_mixin
from dataclasses import dataclass
@dataclass
class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin):
    """Affinity result record combined with the fields of ``lsi_result_mixin``."""
class affinity_lsi_job(affinity_job, lsi_mixin):
    """Affinity propagation job that also records the LSI dimensions it used."""

    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        # Everything except lsi_dims is handled by the parent initialiser.
        super().__init__(infile, outpath, name, *args, **kwargs)
        super().set_lsi_dims(lsi_dims)

    def get_info(self):
        """Return the affinity result extended with ``lsi_dimensions``."""
        base_fields = super().get_info().__dict__
        self.result = affinity_clustering_result_lsi(
            **base_fields,
            lsi_dimensions=self.lsi_dims,
        )
        return self.result
class affinity_lsi_grid_sweep(lsi_grid_sweep):
    """LSI grid sweep over the affinity propagation parameters.

    Hands ``lsi_grid_sweep`` the job type, the per-dimension sweep class, the
    paths and the four parameter iterables.
    """

    def __init__(self,
                 inpath,
                 lsi_dims,
                 outpath,
                 dampings=[0.9],
                 max_iters=[10000],
                 convergence_iters=[30],
                 preference_quantiles=[0.5]):

        # Swept parameters, in the order the job and the namer consume them.
        swept = (dampings, max_iters, convergence_iters, preference_quantiles)
        super().__init__(affinity_lsi_job,
                         _affinity_lsi_grid_sweep,
                         inpath,
                         lsi_dims,
                         outpath,
                         *swept)
class _affinity_lsi_grid_sweep(grid_sweep):
    """Affinity grid sweep restricted to a single LSI dimension."""

    def __init__(self, inpath, outpath, lsi_dim, *args, **kwargs):
        self.lsi_dim = lsi_dim
        self.jobtype = affinity_lsi_job
        # The LSI dimension goes first in the grid as a one-element axis, so
        # every combination starts with it.
        lsi_axis = [self.lsi_dim]
        super().__init__(self.jobtype,
                         inpath,
                         outpath,
                         self.namer,
                         lsi_axis,
                         *args,
                         **kwargs)

    def namer(self, *args, **kwargs):
        """Name a run: the plain affinity name plus an ``_lsi-<dim>`` suffix."""
        # args[0] is the LSI dimension; the rest are the affinity parameters.
        base = affinity_grid_sweep.namer(self, *args[1:], **kwargs)
        return base + f"_lsi-{self.lsi_dim}"
def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], lsi_dimensions='all',n_cores=30):
    """Run affinity clustering once or more with different parameters.

    Usage:
    affinity_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters=<csv> --dampings=<csv> --preference_quantiles=<csv> --lsi_dimensions=<"all" or csv>

    Keyword arguments:
    savefile: path to save the metadata and diagnostics
    inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
    outpath: path to output fit affinity propagation clusterings.
    dampings: one or more numbers in [0.5, 1). damping parameter in affinity propagation clustering.
    preference_quantiles: one or more numbers in (0,1) for selecting the 'preference' parameter.
    convergence_iters: one or more integers of number of iterations without improvement before stopping.
    max_iters: one or more numbers of different maximum iterations.
    lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
    n_cores: number of cores passed to the sweep's run().
    """
    def as_list(value):
        # A single command-line value (e.g. --dampings=0.8) reaches us as a
        # bare number rather than a list; wrap it so it can be iterated.
        if isinstance(value, (str, bytes)) or not hasattr(value, "__iter__"):
            return [value]
        return list(value)

    obj = affinity_lsi_grid_sweep(inpath,
                                  lsi_dimensions,
                                  outpath,
                                  [float(d) for d in as_list(dampings)],
                                  [int(m) for m in as_list(max_iters)],
                                  [int(c) for c in as_list(convergence_iters)],
                                  [float(q) for q in as_list(preference_quantiles)])

    obj.run(n_cores)
    obj.save(savefile)
if __name__ == "__main__":
fire.Fire(run_affinity_lsi_grid_sweep)

View File

@@ -3,7 +3,7 @@
import sys import sys
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from sklearn.cluster import AffinityPropagation, KMeans from sklearn.cluster import AffinityPropagation
import fire import fire
from pathlib import Path from pathlib import Path
from multiprocessing import cpu_count from multiprocessing import cpu_count
@@ -46,24 +46,6 @@ def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000,
print(f"saved {output}") print(f"saved {output}")
return clustering return clustering
def kmeans_clustering(similarities, *args, **kwargs):
subreddits, mat = read_similarity_mat(similarities)
mat = sim_to_dist(mat)
clustering = _kmeans_clustering(mat, *args, **kwargs)
cluster_data = process_clustering_result(clustering, subreddits)
return(cluster_data)
def _kmeans_clustering(mat, output, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True):
clustering = KMeans(n_clusters=n_clusters,
n_init=n_init,
max_iter=max_iter,
random_state=random_state,
verbose=verbose
).fit(mat)
return clustering
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -1,49 +1,151 @@
import pickle
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from dataclasses import dataclass from dataclasses import dataclass
from sklearn.metrics import silhouette_score, silhouette_samples
from collections import Counter
def sim_to_dist(mat): # this is meant to be an interface, not created directly
dist = 1-mat class clustering_job:
dist[dist < 0] = 0 def __init__(self, infile, outpath, name, call, *args, **kwargs):
np.fill_diagonal(dist,0) self.outpath = Path(outpath)
return dist self.call = call
self.args = args
self.kwargs = kwargs
self.infile = Path(infile)
self.name = name
self.hasrun = False
def process_clustering_result(clustering, subreddits): def run(self):
self.subreddits, self.mat = self.read_distance_mat(self.infile)
self.clustering = self.call(self.mat, *self.args, **self.kwargs)
self.cluster_data = self.process_clustering(self.clustering, self.subreddits)
self.outpath.mkdir(parents=True, exist_ok=True)
self.cluster_data.to_feather(self.outpath/(self.name + ".feather"))
self.hasrun = True
self.cleanup()
def cleanup(self):
    """Reset the per-run data attributes to None so they can be released."""
    for attr in ("cluster_data", "mat", "clustering", "subreddits"):
        setattr(self, attr, None)
def get_info(self):
    """Return a ``clustering_result`` for this job, running it first if needed."""
    if not self.hasrun:
        self.run()

    # Collect the summary fields, then build and remember the result record.
    summary = dict(outpath=str(self.outpath.resolve()),
                   silhouette_score=self.score,
                   name=self.name,
                   n_clusters=self.n_clusters,
                   n_isolates=self.n_isolates,
                   silhouette_samples=self.silsampout)
    self.result = clustering_result(**summary)
    return self.result
def silhouette(self):
    """Compute the silhouette score, ignoring isolates, and save per-sample scores.

    Isolates are points labelled -1 and points that are alone in their
    cluster.  When there is more than one cluster the per-sample silhouette
    values (computed on the full matrix) are written to a feather file whose
    path is stored on ``self.silsampout``; otherwise the score and that path
    are both None.
    """
    labels = self.clustering.labels_
    sizes = Counter(labels)
    singleton_labels = [label for label, size in sizes.items() if size == 1]
    is_isolate = (labels == -1) | (np.isin(labels, np.array(singleton_labels)))
    keep = ~is_isolate
    scoremat = self.mat[keep][:, keep]

    if not self.n_clusters > 1:
        score = None
        self.silsampout = None
        return score

    score = silhouette_score(scoremat, labels[keep], metric='precomputed')
    per_sample = silhouette_samples(self.mat, labels, metric='precomputed')
    per_sample = pd.DataFrame({'subreddit': self.subreddits, 'score': per_sample})
    self.outpath.mkdir(parents=True, exist_ok=True)
    target = self.outpath / ("silhouette_samples-" + self.name + ".feather")
    self.silsampout = target.resolve()
    per_sample.to_feather(self.silsampout)
    return score
def read_distance_mat(self, similarities, use_threads=True):
    """Load a feather matrix and return ``(labels, 1 - matrix)``.

    The ``_subreddit`` column supplies the labels; the remaining columns form
    the matrix, whose diagonal is set to 1 before it is inverted.
    """
    print(similarities)
    frame = pd.read_feather(similarities, use_threads=use_threads)
    values = np.array(frame.drop('_subreddit', axis=1))
    diagonal = range(values.shape[0])
    values[diagonal, diagonal] = 1
    return (frame._subreddit, 1 - values)
def process_clustering(self, clustering, subreddits):
    """Summarise a fitted clustering and return the subreddit/cluster table.

    Sets ``self.n_clusters``, ``self.score`` (via ``self.silhouette()``) and
    ``self.n_isolates``, printing diagnostics along the way.

    Args:
        clustering: fitted model exposing ``labels_`` (and optionally ``n_iter_``).
        subreddits: labels aligned with ``clustering.labels_``.

    Returns:
        DataFrame with columns ``subreddit`` and ``cluster``.
    """
    if hasattr(clustering,'n_iter_'):
        print(f"clustering took {clustering.n_iter_} iterations")

    clusters = clustering.labels_
    self.n_clusters = len(set(clusters))

    print(f"found {self.n_clusters} clusters")

    cluster_data = pd.DataFrame({'subreddit': subreddits,'cluster':clustering.labels_})

    self.score = self.silhouette()
    print(f"silhouette_score:{self.score}")

    cluster_sizes = cluster_data.groupby("cluster").count().reset_index()
    print(f"the largest cluster has {cluster_sizes.loc[cluster_sizes.cluster!=-1].subreddit.max()} members")

    print(f"the median cluster has {cluster_sizes.subreddit.median()} members")
    n_isolates1 = (cluster_sizes.subreddit==1).sum()

    print(f"{n_isolates1} clusters have 1 member")

    # Size of cluster -1 as a plain count (0 when there is no such cluster).
    # A misspelled assignment used to leave this as a list, which then leaked
    # into self.n_isolates.
    noise_sizes = cluster_sizes.loc[cluster_sizes.cluster==-1,:]['subreddit'].to_list()
    n_isolates2 = noise_sizes[0] if noise_sizes else 0

    print(f"{n_isolates2} subreddits are in cluster -1",flush=True)

    # Prefer the singleton-cluster count; fall back to the size of cluster -1.
    if n_isolates1 == 0:
        self.n_isolates = n_isolates2
    else:
        self.n_isolates = n_isolates1

    return cluster_data
class twoway_clustering_job(clustering_job):
    """Clustering job made of two chained callables.

    ``call1`` receives the matrix and ``args1``; ``call2`` receives the matrix,
    the output of ``call1`` and ``args2``, and its return value is treated as
    the clustering.
    """

    def __init__(self, infile, outpath, name, call1, call2, args1, args2):
        self.outpath = Path(outpath)
        self.call1, self.args1 = call1, args1
        self.call2, self.args2 = call2, args2
        self.infile = Path(infile)
        self.name = name
        self.hasrun = False
        # Combined keyword arguments of both steps.
        self.args = args1|args2

    def run(self):
        """Run both steps, summarise the clustering, write it out, then clean up."""
        self.subreddits, self.mat = self.read_distance_mat(self.infile)
        first = self.call1(self.mat, **self.args1)
        self.step1 = first
        self.clustering = self.call2(self.mat, first, **self.args2)
        self.cluster_data = self.process_clustering(self.clustering, self.subreddits)
        self.hasrun = True
        self.after_run()
        self.cleanup()

    def after_run(self):
        """Score the clustering and write the cluster table to a feather file."""
        self.score = self.silhouette()
        self.outpath.mkdir(parents=True, exist_ok=True)
        destination = self.outpath/(self.name+".feather")
        print(destination)
        self.cluster_data.to_feather(self.outpath/(self.name + ".feather"))

    def cleanup(self):
        """Release the base-class data and the intermediate first-step result."""
        super().cleanup()
        self.step1 = None
@dataclass @dataclass
class clustering_result: class clustering_result:
outpath:Path outpath:Path
max_iter:int
silhouette_score:float silhouette_score:float
alt_silhouette_score:float
name:str name:str
n_clusters:int n_clusters:int
n_isolates:int
def read_similarity_mat(similarities, use_threads=True): silhouette_samples:str
df = pd.read_feather(similarities, use_threads=use_threads)
mat = np.array(df.drop('_subreddit',1))
n = mat.shape[0]
mat[range(n),range(n)] = 1
return (df._subreddit,mat)

View File

@@ -1,34 +0,0 @@
import fire
import pyarrow
import pandas as pd
from numpy import random
import numpy as np
from sklearn.manifold import TSNE
similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet"
def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=10000, early_exaggeration=20):
    '''
    Fit a 2-dimensional t-SNE embedding of a similarity matrix and save it.

    similarities: feather file with a dataframe of similarity scores
    output: feather file that receives the columns x, y and subreddit
    learning_rate: parameter controlling how fast the model converges. Too low and you get outliers. Too high and you get a ball.
    perplexity: number of neighbors to use. the default of 50 is often good.
    n_iter: passed to TSNE as n_iter.
    early_exaggeration: passed to TSNE as early_exaggeration.
    '''
    df = pd.read_feather(similarities)

    n = df.shape[0]
    # keyword form: the positional axis argument of DataFrame.drop is gone in pandas 2
    mat = np.array(df.drop(columns='subreddit'), dtype=np.float64)
    mat[range(n), range(n)] = 1
    # clip values above 1 so that arccos stays defined
    mat[mat > 1] = 1
    dist = 2*np.arccos(mat)/np.pi

    # use the caller's settings; they were previously ignored in favour of
    # hard-coded copies of the defaults
    tsne_model = TSNE(2,
                      learning_rate=learning_rate,
                      perplexity=perplexity,
                      n_iter=n_iter,
                      metric='precomputed',
                      early_exaggeration=early_exaggeration,
                      n_jobs=-1)

    # fit_transform fits the model itself; a separate fit() call beforehand
    # only repeated the whole optimisation
    tsne_fit_whole = tsne_model.fit_transform(dist)

    plot_data = pd.DataFrame({'x': tsne_fit_whole[:, 0], 'y': tsne_fit_whole[:, 1], 'subreddit': df.subreddit})

    plot_data.to_feather(output)
if __name__ == "__main__":
fire.Fire(fit_tsne)

49
clustering/grid_sweep.py Normal file
View File

@@ -0,0 +1,49 @@
from pathlib import Path
from multiprocessing import Pool, cpu_count
from itertools import product, chain
import pandas as pd
class grid_sweep:
    """Build one job per point of a parameter grid and collect their info.

    ``args`` are iterables of parameter values; their cartesian product is the
    grid.  Each job is built as ``jobtype(inpath, outpath, name, *params)``
    where ``name`` is ``namer(*params)``.
    """

    def __init__(self, jobtype, inpath, outpath, namer, *args):
        self.jobtype = jobtype
        self.namer = namer
        print(*args)
        combos = list(product(*args))
        in_dir = Path(inpath)
        out_dir = Path(outpath)
        self.hasrun = False

        self.grid = []
        for params in combos:
            self.grid.append((in_dir, out_dir, namer(*params)) + params)

        self.jobs = []
        for job_args in self.grid:
            self.jobs.append(jobtype(*job_args))

    def run(self, cores=20):
        """Call ``get_info`` on every job and store the results as a DataFrame.

        A process pool of ``cores`` workers is used when ``cores`` is greater
        than 1; otherwise the jobs are processed in this process.
        """
        serial = cores is None or not cores > 1
        if serial:
            infos = map(self.jobtype.get_info, self.jobs)
        else:
            with Pool(cores) as pool:
                infos = pool.map(self.jobtype.get_info, self.jobs)

        self.infos = pd.DataFrame(infos)
        self.hasrun = True

    def save(self, outcsv):
        """Write the collected info to ``outcsv``, running the sweep first if needed."""
        if not self.hasrun:
            self.run()
        target = Path(outcsv)
        target.parent.mkdir(parents=True, exist_ok=True)
        self.infos.to_csv(target)
class twoway_grid_sweep(grid_sweep):
    """Grid sweep over two separate keyword-argument grids.

    ``args1`` and ``args2`` map parameter names to iterables of values.  Every
    combination of ``args1`` is paired with every combination of ``args2``;
    each job receives the two resulting dicts followed by ``*args``.
    """

    def __init__(self, jobtype, inpath, outpath, namer, args1, args2, *args, **kwargs):
        self.jobtype = jobtype
        self.namer = namer

        def expand(spec):
            # one dict per point of the cartesian product of the values
            return [dict(zip(spec.keys(), values)) for values in product(*spec.values())]

        grid1 = expand(args1)
        grid2 = expand(args2)
        pairs = product(grid1, grid2)

        inpath = Path(inpath)
        outpath = Path(outpath)
        self.hasrun = False

        self.grid = []
        for first, second in pairs:
            # the job name is derived from the union of both parameter dicts
            job_name = namer(**(first | second))
            self.grid.append((inpath, outpath, job_name, first, second, *args))

        self.jobs = [jobtype(*job_args) for job_args in self.grid]

View File

@@ -1,32 +1,57 @@
from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat from clustering_base import clustering_result, clustering_job
from grid_sweep import grid_sweep
from dataclasses import dataclass from dataclasses import dataclass
import hdbscan import hdbscan
from sklearn.neighbors import NearestNeighbors from sklearn.neighbors import NearestNeighbors
import plotnine as pn import plotnine as pn
import numpy as np import numpy as np
from itertools import product, starmap from itertools import product, starmap, chain
import pandas as pd import pandas as pd
from sklearn.metrics import silhouette_score, silhouette_samples from multiprocessing import cpu_count
from pathlib import Path
from multiprocessing import Pool, cpu_count
import fire import fire
from pyarrow.feather import write_feather
def test_select_hdbscan_clustering(): def test_select_hdbscan_clustering():
select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI", # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
"test_hdbscan_author30k", # "test_hdbscan_author30k",
min_cluster_sizes=[2], # min_cluster_sizes=[2],
min_samples=[1,2], # min_samples=[1,2],
cluster_selection_epsilons=[0,0.05,0.1,0.15], # cluster_selection_epsilons=[0,0.05,0.1,0.15],
cluster_selection_methods=['eom','leaf'], # cluster_selection_methods=['eom','leaf'],
lsi_dimensions='all') # lsi_dimensions='all')
inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI" inpath = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_authors_compex_LSI"
outpath = "test_hdbscan"; outpath = "test_hdbscan";
min_cluster_sizes=[2,3,4]; min_cluster_sizes=[2,3,4];
min_samples=[1,2,3]; min_samples=[1,2,3];
cluster_selection_epsilons=[0,0.1,0.3,0.5]; cluster_selection_epsilons=[0,0.1,0.3,0.5];
cluster_selection_methods=['eom']; cluster_selection_methods=[1];
lsi_dimensions='all' lsi_dimensions='all'
gs = hdbscan_lsi_grid_sweep(inpath, "all", outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods)
gs.run(20)
gs.save("test_hdbscan/lsi_sweep.csv")
# job1 = hdbscan_lsi_job(infile=inpath, outpath=outpath, name="test", lsi_dims=500, min_cluster_size=2, min_samples=1,cluster_selection_epsilon=0,cluster_selection_method='eom')
# job1.run()
# print(job1.get_info())
# df = pd.read_csv("test_hdbscan/selection_data.csv")
# test_select_hdbscan_clustering()
# check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
# silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
# c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering)
class hdbscan_grid_sweep(grid_sweep):
    """Grid sweep whose jobs are ``hdbscan_job`` instances."""

    def __init__(self, inpath, outpath, *args, **kwargs):
        # fix the job type and the naming scheme; forward everything else
        super().__init__(hdbscan_job, inpath, outpath, self.namer, *args, **kwargs)

    def namer(self, min_cluster_size, min_samples, cluster_selection_epsilon, cluster_selection_method):
        """Return the job name that encodes the four HDBSCAN parameters."""
        job_name = f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"
        return job_name
@dataclass @dataclass
class hdbscan_clustering_result(clustering_result): class hdbscan_clustering_result(clustering_result):
@@ -34,101 +59,31 @@ class hdbscan_clustering_result(clustering_result):
min_samples:int min_samples:int
cluster_selection_epsilon:float cluster_selection_epsilon:float
cluster_selection_method:str cluster_selection_method:str
lsi_dimensions:int
n_isolates:int
silhouette_samples:str
def select_hdbscan_clustering(inpath, class hdbscan_job(clustering_job):
def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
super().__init__(infile,
outpath, outpath,
outfile=None, name,
min_cluster_sizes=[2], call=hdbscan_job._hdbscan_clustering,
min_samples=[1],
cluster_selection_epsilons=[0],
cluster_selection_methods=['eom'],
lsi_dimensions='all'
):
inpath = Path(inpath)
outpath = Path(outpath)
outpath.mkdir(exist_ok=True, parents=True)
if lsi_dimensions == 'all':
lsi_paths = list(inpath.glob("*"))
else:
lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]
lsi_nums = [p.stem for p in lsi_paths]
grid = list(product(lsi_nums,
min_cluster_sizes,
min_samples,
cluster_selection_epsilons,
cluster_selection_methods))
# fix the output file names
names = list(map(lambda t:'_'.join(map(str,t)),grid))
grid = [(inpath/(str(t[0])+'.feather'),outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)]
with Pool(int(cpu_count()/4)) as pool:
mods = starmap(hdbscan_clustering, grid)
res = pd.DataFrame(mods)
if outfile is None:
outfile = outpath / "selection_data.csv"
res.to_csv(outfile)
def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
subreddits, mat = read_similarity_mat(similarities)
mat = sim_to_dist(mat)
clustering = _hdbscan_clustering(mat,
min_cluster_size=min_cluster_size, min_cluster_size=min_cluster_size,
min_samples=min_samples, min_samples=min_samples,
cluster_selection_epsilon=cluster_selection_epsilon, cluster_selection_epsilon=cluster_selection_epsilon,
cluster_selection_method=cluster_selection_method, cluster_selection_method=cluster_selection_method
metric='precomputed',
core_dist_n_jobs=cpu_count()
) )
cluster_data = process_clustering_result(clustering, subreddits) self.min_cluster_size = min_cluster_size
isolates = clustering.labels_ == -1 self.min_samples = min_samples
scoremat = mat[~isolates][:,~isolates] self.cluster_selection_epsilon = cluster_selection_epsilon
score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed') self.cluster_selection_method = cluster_selection_method
cluster_data.to_feather(output) # self.mat = 1 - self.mat
silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp})
silsampout = output.parent / ("silhouette_samples" + output.name)
silhouette_samp.to_feather(silsampout)
result = hdbscan_clustering_result(outpath=output,
max_iter=None,
silhouette_samples=silsampout,
silhouette_score=score,
alt_silhouette_score=score,
name=name,
min_cluster_size=min_cluster_size,
min_samples=min_samples,
cluster_selection_epsilon=cluster_selection_epsilon,
cluster_selection_method=cluster_selection_method,
lsi_dimensions=lsi_dim,
n_isolates=isolates.sum(),
n_clusters=len(set(clustering.labels_))
)
return(result)
# for all runs we should try cluster_selection_epsilon = None
# for terms we should try cluster_selection_epsilon around 0.56-0.66
# for authors we should try cluster_selection_epsilon around 0.98-0.99
def _hdbscan_clustering(mat, *args, **kwargs): def _hdbscan_clustering(mat, *args, **kwargs):
print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}") print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
print(mat) print(mat)
clusterer = hdbscan.HDBSCAN(*args, clusterer = hdbscan.HDBSCAN(metric='precomputed',
core_dist_n_jobs=cpu_count(),
*args,
**kwargs, **kwargs,
) )
@@ -136,6 +91,39 @@ def _hdbscan_clustering(mat, *args, **kwargs):
return(clustering) return(clustering)
def get_info(self):
    """Return the parent's job info extended with the HDBSCAN parameters.

    The result is also cached on ``self.result``.
    """
    base_info = super().get_info()
    hdbscan_params = {
        'min_cluster_size': self.min_cluster_size,
        'min_samples': self.min_samples,
        'cluster_selection_epsilon': self.cluster_selection_epsilon,
        'cluster_selection_method': self.cluster_selection_method,
    }
    self.result = hdbscan_clustering_result(**base_info.__dict__, **hdbscan_params)
    return self.result
def run_hdbscan_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']):
    """Run hdbscan clustering once or more with different parameters.

    Usage:
    hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf">

    Keyword arguments:
    savefile: path to save the metadata and diagnostics
    inpath: path to feather data containing a labeled matrix of subreddit similarities.
    outpath: path to output fit hdbscan clusterings.
    min_cluster_sizes: one or more integers indicating the minimum cluster size
    min_samples: one or more integers indicating the minimum number of samples used in the algorithm
    cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan
    cluster_selection_methods: one or more of "eom" or "leaf"; eom gives larger clusters.
    """

    def _as_list(value, cast):
        # fire turns a single command-line value (e.g. --min_samples=2) into a
        # scalar, and a bare string must not be iterated character by character
        if isinstance(value, (str, bytes)) or not hasattr(value, '__iter__'):
            value = [value]
        return [cast(v) for v in value]

    obj = hdbscan_grid_sweep(inpath,
                             outpath,
                             _as_list(min_cluster_sizes, int),
                             _as_list(min_samples, int),
                             _as_list(cluster_selection_epsilons, float),
                             _as_list(cluster_selection_methods, lambda m: m))
    obj.run()
    obj.save(savefile)
def KNN_distances_plot(mat,outname,k=2): def KNN_distances_plot(mat,outname,k=2):
nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat) nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat)
distances, indices = nbrs.kneighbors(mat) distances, indices = nbrs.kneighbors(mat)
@@ -165,8 +153,7 @@ def make_KNN_plots():
KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png') KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png')
if __name__ == "__main__": if __name__ == "__main__":
df = pd.read_csv("test_hdbscan/selection_data.csv") fire.Fire(run_hdbscan_grid_sweep)
test_select_hdbscan_clustering()
check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather") # test_select_hdbscan_clustering()
silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather") #fire.Fire(select_hdbscan_clustering)
c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering)

View File

@@ -0,0 +1,101 @@
from hdbscan_clustering import hdbscan_job, hdbscan_grid_sweep, hdbscan_clustering_result
from lsi_base import lsi_grid_sweep, lsi_mixin, lsi_result_mixin
from grid_sweep import grid_sweep
import fire
from dataclasses import dataclass
# Result record for an HDBSCAN run on an LSI similarity matrix: combines the
# fields of hdbscan_clustering_result with those of lsi_result_mixin (both
# imported above) and declares no fields of its own.
@dataclass
class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
    pass
class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
    """An ``hdbscan_job`` that also records the LSI dimensionality of its input."""

    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        # everything except lsi_dims goes to the next constructor in the MRO
        super().__init__(infile, outpath, name, *args, **kwargs)
        super().set_lsi_dims(lsi_dims)

    def get_info(self):
        """Return the parent's info extended with ``lsi_dimensions``; cached on ``self.result``."""
        base_info = super().get_info()
        lsi_result = hdbscan_clustering_result_lsi(**base_info.__dict__,
                                                   lsi_dimensions=self.lsi_dims)
        self.result = lsi_result
        return self.result
class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
    """LSI grid sweep specialised for HDBSCAN jobs.

    Fixes the job type to ``hdbscan_lsi_job`` and the per-dimension sweep to
    ``_hdbscan_lsi_grid_sweep``; the remaining arguments are forwarded to
    ``lsi_grid_sweep`` unchanged.
    """

    def __init__(self, inpath, lsi_dims, outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods):
        hdbscan_grids = (min_cluster_sizes,
                         min_samples,
                         cluster_selection_epsilons,
                         cluster_selection_methods)
        super().__init__(hdbscan_lsi_job, _hdbscan_lsi_grid_sweep, inpath, lsi_dims, outpath, *hdbscan_grids)
class _hdbscan_lsi_grid_sweep(grid_sweep):
    """HDBSCAN grid sweep over a single LSI similarity file.

    ``lsi_dim`` is prepended to the parameter grid as a one-element list, so
    every job receives it as its first sweep parameter.
    """

    def __init__(self, inpath, outpath, lsi_dim, *args, **kwargs):
        print(args)
        print(kwargs)

        self.lsi_dim = lsi_dim
        self.jobtype = hdbscan_lsi_job
        lsi_axis = [self.lsi_dim]
        super().__init__(self.jobtype, inpath, outpath, self.namer, lsi_axis, *args, **kwargs)

    def namer(self, *args, **kwargs):
        """Name a job with the plain HDBSCAN scheme plus an ``_lsi-<dim>`` suffix."""
        # args[0] is the LSI dimension; the HDBSCAN namer only takes the rest
        hdbscan_args = args[1:]
        base_name = hdbscan_grid_sweep.namer(self, *hdbscan_args, **kwargs)
        return base_name + f"_lsi-{self.lsi_dim}"
def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'],lsi_dimensions='all'):
    """Run hdbscan clustering once or more with different parameters.

    Usage:
    hdbscan_clustering_lsi --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf"> --lsi_dimensions=<"all"|csv of LSI dimensions>

    Keyword arguments:
    savefile: path to save the metadata and diagnostics
    inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
    outpath: path to output fit clusterings.
    min_cluster_sizes: one or more integers indicating the minimum cluster size
    min_samples: one or more integers indicating the minimum number of samples used in the algorithm
    cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan
    cluster_selection_methods: one or more of "eom" or "leaf"; eom gives larger clusters.
    lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
    """

    def _as_list(value, cast):
        # fire turns a single command-line value (e.g. --min_samples=2) into a
        # scalar, and a bare string must not be iterated character by character
        if isinstance(value, (str, bytes)) or not hasattr(value, '__iter__'):
            value = [value]
        return [cast(v) for v in value]

    # "all" is a sentinel understood downstream; anything else is a list of dimensions
    if not (isinstance(lsi_dimensions, str) and lsi_dimensions == 'all'):
        lsi_dimensions = _as_list(lsi_dimensions, int)

    obj = hdbscan_lsi_grid_sweep(inpath,
                                 lsi_dimensions,
                                 outpath,
                                 _as_list(min_cluster_sizes, int),
                                 _as_list(min_samples, int),
                                 _as_list(cluster_selection_epsilons, float),
                                 _as_list(cluster_selection_methods, lambda m: m))

    obj.run(10)
    obj.save(savefile)
if __name__ == "__main__":
fire.Fire(run_hdbscan_lsi_grid_sweep)

View File

@@ -0,0 +1,105 @@
from sklearn.cluster import KMeans
import fire
from pathlib import Path
from dataclasses import dataclass
from clustering_base import clustering_result, clustering_job
from grid_sweep import grid_sweep
# Result record for one k-means run: extends clustering_result (imported
# above) with the k-means settings declared below.
@dataclass
class kmeans_clustering_result(clustering_result):
    n_clusters:int  # n_clusters setting of the run
    n_init:int  # n_init setting of the run
    max_iter:int  # max_iter setting of the run
class kmeans_job(clustering_job):
    """Clustering job that fits scikit-learn's ``KMeans``."""

    def __init__(self, infile, outpath, name, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True):
        # keyword arguments handed to the parent along with the clustering
        # callable; presumably the parent forwards them to that callable
        kmeans_kwargs = dict(n_clusters=n_clusters,
                             n_init=n_init,
                             max_iter=max_iter,
                             random_state=random_state,
                             verbose=verbose)
        super().__init__(infile,
                         outpath,
                         name,
                         call=kmeans_job._kmeans_clustering,
                         **kmeans_kwargs)

        self.n_clusters = n_clusters
        self.n_init = n_init
        self.max_iter = max_iter

    def _kmeans_clustering(mat, *args, **kwargs):
        # plain function stored on the class (no self); it is referenced as
        # kmeans_job._kmeans_clustering in __init__
        model = KMeans(*args, **kwargs)
        clustering = model.fit(mat)
        return clustering

    def get_info(self):
        """Return the parent's info extended with ``n_init`` and ``max_iter``; cached on ``self.result``."""
        base_info = super().get_info()
        self.result = kmeans_clustering_result(**base_info.__dict__,
                                               n_init=self.n_init,
                                               max_iter=self.max_iter)
        return self.result
class kmeans_grid_sweep(grid_sweep):
    """Grid sweep whose jobs are ``kmeans_job`` instances."""

    def __init__(self, inpath, outpath, *args, **kwargs):
        # fix the job type and the naming scheme; forward everything else
        super().__init__(kmeans_job, inpath, outpath, self.namer, *args, **kwargs)

    def namer(self, n_clusters, n_init, max_iter):
        """Return the job name that encodes the three k-means parameters."""
        job_name = f"nclusters-{n_clusters}_nit-{n_init}_maxit-{max_iter}"
        return job_name
def test_select_kmeans_clustering():
    """Smoke test: sweep a small k-means grid over every LSI matrix at a hard-coded path.

    The previous body referenced names that do not exist in this module
    (``kmeans_lsi_grid_sweep``, ``hdbscan_lsi_grid_sweep``, ``min_cluster_sizes``,
    ``min_samples``) and ended in pasted HDBSCAN code, so it could not run.
    """
    # imported lazily: kmeans_clustering_lsi imports this module, so a
    # top-level import here would be circular
    from kmeans_clustering_lsi import kmeans_lsi_grid_sweep

    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
    outpath = "test_kmeans"
    n_clusters = [200, 300, 400]
    n_init = [1, 2, 3]
    max_iter = [100000]

    gs = kmeans_lsi_grid_sweep(inpath, 'all', outpath, n_clusters, n_init, max_iter)
    gs.run(1)
    gs.save("test_kmeans/lsi_sweep.csv")
def run_kmeans_grid_sweep(savefile, inpath, outpath, n_clusters=[500], n_inits=[1], max_iters=[3000]):
    """Run kmeans clustering once or more with different parameters.

    Usage:
    kmeans_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --n_clusters=<csv number of clusters> --n_inits=<csv> --max_iters=<csv>

    Keyword arguments:
    savefile: path to save the metadata and diagnostics
    inpath: path to feather data containing a labeled matrix of subreddit similarities.
    outpath: path to output fit kmeans clusterings.
    n_clusters: one or more numbers of kmeans clusters to select.
    n_inits: one or more numbers of different initializations to use for each clustering.
    max_iters: one or more numbers of different maximum iterations.
    """

    def _as_int_list(value):
        # fire turns a single command-line value (e.g. --n_clusters=500) into
        # a scalar, which map(int, ...) cannot iterate
        if isinstance(value, (str, bytes)) or not hasattr(value, '__iter__'):
            value = [value]
        return [int(v) for v in value]

    obj = kmeans_grid_sweep(inpath,
                            outpath,
                            _as_int_list(n_clusters),
                            _as_int_list(n_inits),
                            _as_int_list(max_iters))

    obj.run(1)
    obj.save(savefile)
if __name__ == "__main__":
fire.Fire(run_kmeans_grid_sweep)

View File

@@ -0,0 +1,93 @@
import fire
from dataclasses import dataclass
from kmeans_clustering import kmeans_job, kmeans_clustering_result, kmeans_grid_sweep
from lsi_base import lsi_mixin, lsi_result_mixin, lsi_grid_sweep
from grid_sweep import grid_sweep
# Result record for a k-means run on an LSI similarity matrix: combines the
# fields of kmeans_clustering_result with those of lsi_result_mixin (both
# imported above) and declares no fields of its own.
@dataclass
class kmeans_clustering_result_lsi(kmeans_clustering_result, lsi_result_mixin):
    pass
class kmeans_lsi_job(kmeans_job, lsi_mixin):
    """A ``kmeans_job`` that also records the LSI dimensionality of its input."""

    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        # everything except lsi_dims goes to the next constructor in the MRO
        super().__init__(infile, outpath, name, *args, **kwargs)
        super().set_lsi_dims(lsi_dims)

    def get_info(self):
        """Return the parent's info extended with ``lsi_dimensions``; cached on ``self.result``."""
        base_info = super().get_info()
        lsi_result = kmeans_clustering_result_lsi(**base_info.__dict__,
                                                  lsi_dimensions=self.lsi_dims)
        self.result = lsi_result
        return self.result
class _kmeans_lsi_grid_sweep(grid_sweep):
    """k-means grid sweep over a single LSI similarity file.

    ``lsi_dim`` is prepended to the parameter grid as a one-element list, so
    every job receives it as its first sweep parameter.
    """

    def __init__(self, inpath, outpath, lsi_dim, *args, **kwargs):
        print(args)
        print(kwargs)
        self.lsi_dim = lsi_dim
        self.jobtype = kmeans_lsi_job
        lsi_axis = [self.lsi_dim]
        super().__init__(self.jobtype, inpath, outpath, self.namer, lsi_axis, *args, **kwargs)

    def namer(self, *args, **kwargs):
        """Name a job with the plain k-means scheme plus an ``_lsi-<dim>`` suffix."""
        # args[0] is the LSI dimension; the k-means namer only takes the rest
        kmeans_args = args[1:]
        base_name = kmeans_grid_sweep.namer(self, *kmeans_args, **kwargs)
        return base_name + f"_lsi-{self.lsi_dim}"
class kmeans_lsi_grid_sweep(lsi_grid_sweep):
    """LSI grid sweep specialised for k-means jobs.

    Fixes the job type to ``kmeans_lsi_job`` and the per-dimension sweep to
    ``_kmeans_lsi_grid_sweep``; the remaining arguments are forwarded to
    ``lsi_grid_sweep`` unchanged.
    """

    def __init__(self, inpath, lsi_dims, outpath, n_clusters, n_inits, max_iters):
        kmeans_grids = (n_clusters, n_inits, max_iters)
        super().__init__(kmeans_lsi_job, _kmeans_lsi_grid_sweep, inpath, lsi_dims, outpath, *kmeans_grids)
def run_kmeans_lsi_grid_sweep(savefile, inpath, outpath, n_clusters=[500], n_inits=[1], max_iters=[3000], lsi_dimensions="all"):
    """Run kmeans clustering once or more with different parameters.

    Usage:
    kmeans_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --lsi_dimensions=<"all"|csv number of LSI dimensions to use> --n_clusters=<csv number of clusters> --n_inits=<csv> --max_iters=<csv>

    Keyword arguments:
    savefile: path to save the metadata and diagnostics
    inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
    outpath: path to output fit kmeans clusterings.
    lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
    n_clusters: one or more numbers of kmeans clusters to select.
    n_inits: one or more numbers of different initializations to use for each clustering.
    max_iters: one or more numbers of different maximum iterations.
    """

    def _as_int_list(value):
        # fire turns a single command-line value (e.g. --n_clusters=500) into
        # a scalar, which map(int, ...) cannot iterate
        if isinstance(value, (str, bytes)) or not hasattr(value, '__iter__'):
            value = [value]
        return [int(v) for v in value]

    # "all" is a sentinel understood downstream; anything else is a list of dimensions
    if not (isinstance(lsi_dimensions, str) and lsi_dimensions == 'all'):
        lsi_dimensions = _as_int_list(lsi_dimensions)

    obj = kmeans_lsi_grid_sweep(inpath,
                                lsi_dimensions,
                                outpath,
                                _as_int_list(n_clusters),
                                _as_int_list(n_inits),
                                _as_int_list(max_iters)
                                )

    obj.run(1)
    obj.save(savefile)
if __name__ == "__main__":
fire.Fire(run_kmeans_lsi_grid_sweep)

44
clustering/lsi_base.py Normal file
View File

@@ -0,0 +1,44 @@
from clustering_base import clustering_job, clustering_result
from grid_sweep import grid_sweep, twoway_grid_sweep
from dataclasses import dataclass
from itertools import chain
from pathlib import Path
class lsi_mixin():
    """Mixin that lets a job remember the LSI dimensionality it was given."""

    def set_lsi_dims(self, lsi_dims):
        """Store ``lsi_dims`` on the instance as ``self.lsi_dims``."""
        self.lsi_dims = lsi_dims
# Dataclass mixin that adds a single field, lsi_dimensions, to a result record.
@dataclass
class lsi_result_mixin:
    lsi_dimensions:int  # LSI dimensionality associated with the result
def _resolve_lsi_paths(inpath, lsi_dimensions):
    """Return the ``<dim>.feather`` files under *inpath* selected by *lsi_dimensions*.

    ``'all'`` selects every ``*.feather`` file in the directory.  Otherwise
    *lsi_dimensions* is one dimension or an iterable of dimensions; a single
    int or string is wrapped in a list so that it is neither rejected (int)
    nor iterated character by character (str).
    """
    inpath = Path(inpath)
    if isinstance(lsi_dimensions, str) and lsi_dimensions == 'all':
        return list(inpath.glob("*.feather"))
    if isinstance(lsi_dimensions, (str, int)):
        lsi_dimensions = [lsi_dimensions]
    return [inpath / (str(dim) + '.feather') for dim in lsi_dimensions]


class lsi_grid_sweep(grid_sweep):
    """Run one sub-sweep per LSI similarity file and pool all their jobs.

    ``subsweep`` is called as ``subsweep(lsi_path, outpath, lsi_dim, *args,
    **kwargs)`` for every selected file; the file stem is parsed as the
    integer LSI dimension.
    """

    def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, *args, **kwargs):
        self.jobtype = jobtype
        self.subsweep = subsweep
        lsi_paths = _resolve_lsi_paths(inpath, lsi_dimensions)

        print(lsi_paths)
        lsi_nums = [int(p.stem) for p in lsi_paths]
        self.hasrun = False
        self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)]
        # flatten the jobs of all sub-sweeps into one list
        self.jobs = list(chain.from_iterable(gs.jobs for gs in self.subgrids))


class twoway_lsi_grid_sweep(twoway_grid_sweep):
    """Two-grid counterpart of ``lsi_grid_sweep``.

    ``subsweep`` is called as ``subsweep(lsi_path, outpath, lsi_dim, args1,
    args2)`` for every selected file.
    """

    def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, args1, args2):
        self.jobtype = jobtype
        self.subsweep = subsweep
        lsi_paths = _resolve_lsi_paths(inpath, lsi_dimensions)

        lsi_nums = [int(p.stem) for p in lsi_paths]
        self.hasrun = False
        self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, args1, args2) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)]
        # flatten the jobs of all sub-sweeps into one list
        self.jobs = list(chain.from_iterable(gs.jobs for gs in self.subgrids))

View File

@@ -0,0 +1,33 @@
#!/usr/bin/env python3
import fire
import pandas as pd
from pathlib import Path
import shutil
selection_data="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/clustering/comment_authors_compex_LSI/selection_data.csv"
outpath = 'test_best.feather'
min_clusters=50; max_isolates=7500; min_cluster_size=2
# pick the best clustering according to silhouette score subject to constraints
def pick_best_clustering(selection_data, output, min_clusters, max_isolates, min_cluster_size):
    """Copy the best clustering listed in *selection_data* to *output*.

    selection_data: csv file with one row per clustering (columns used:
        silhouette_score, n_isolates, n_clusters, min_cluster_size,
        lsi_dimensions, outpath, name).
    output: destination path for the copy of the winning feather file.
    min_clusters: keep rows with at least this many clusters.
    max_isolates: keep rows with at most this many isolates.
    min_cluster_size: keep rows with exactly this min_cluster_size.

    Raises IndexError when no row satisfies the constraints.
    """
    df = pd.read_csv(selection_data, index_col=0)
    df = df.sort_values("silhouette_score", ascending=False)

    # n_isolates may have been written as a stringified list ("[]" or "[123]")
    # or as a plain number; normalise both to integers, with "[]" meaning 0.
    # Going through str first keeps this working when pandas parsed the
    # column as numeric (the .str accessor is unavailable there).
    stripped = df.n_isolates.astype(str).str.strip("[]").str.strip()
    df['n_isolates'] = pd.to_numeric(stripped.where(stripped != "", "0")).astype(int)

    candidates = df[(df.n_isolates <= max_isolates) & (df.n_clusters >= min_clusters) & (df.min_cluster_size == min_cluster_size)]
    if candidates.empty:
        raise IndexError(
            f"no clustering in {selection_data} has n_isolates <= {max_isolates}, "
            f"n_clusters >= {min_clusters} and min_cluster_size == {min_cluster_size}"
        )

    # rows are sorted by descending silhouette score, so the first one wins
    best_cluster = candidates.iloc[0]

    best_lsi_dimensions = best_cluster.lsi_dimensions
    print(best_cluster.to_dict())
    best_path = Path(best_cluster.outpath) / (str(best_cluster['name']) + ".feather")
    shutil.copy(best_path, output)
    print(f"lsi dimensions:{best_lsi_dimensions}")
if __name__ == "__main__":
fire.Fire(pick_best_clustering)

View File

@@ -1,132 +0,0 @@
from sklearn.metrics import silhouette_score
from sklearn.cluster import AffinityPropagation
from functools import partial
from dataclasses import dataclass
from clustering import _affinity_clustering, read_similarity_mat, sim_to_dist, process_clustering_result, clustering_result
from multiprocessing import Pool, cpu_count, Array, Process
from pathlib import Path
from itertools import product, starmap
import numpy as np
import pandas as pd
import fire
import sys
# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.
# Result record for one affinity-propagation run: extends clustering_result
# (imported above) with the affinity-propagation settings declared below.
@dataclass
class affinity_clustering_result(clustering_result):
    damping:float  # damping setting of the run
    convergence_iter:int  # convergence_iter setting of the run
    preference_quantile:float  # preference_quantile setting of the run
def do_affinity_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter,  outdir:Path, random_state, verbose, alt_mat, overwrite=False):
    """Fit one affinity-propagation clustering and score it.

    Returns an ``affinity_clustering_result`` holding the parameters, the
    output path and the silhouette scores.  A score is ``None`` when
    ``silhouette_score`` raises ``ValueError`` or, for the alternative score,
    when ``alt_mat`` is ``None``.

    NOTE(review): a second function with this same name is defined right
    after this one in the file and replaces it at import time -- confirm which
    of the two is meant to survive.
    """
    if name is None:
        name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}"
    print(name)
    sys.stdout.flush()
    outpath = outdir / (str(name) + ".feather")
    outpath.parent.mkdir(parents=True,exist_ok=True)
    print(outpath)
    clustering = _affinity_clustering(mat, outpath, damping, max_iter, convergence_iter, preference_quantile, random_state, verbose)
    # the return value was never used; the call is kept in case the helper
    # has side effects -- TODO confirm
    process_clustering_result(clustering, subreddits)
    mat = sim_to_dist(clustering.affinity_matrix_)

    try:
        score = silhouette_score(mat, clustering.labels_, metric='precomputed')
    except ValueError:
        score = None

    alt_score = None
    if alt_mat is not None:
        alt_distances = sim_to_dist(alt_mat)
        try:
            # metric='precomputed' expects distances, so score the converted
            # matrix rather than the raw similarities
            alt_score = silhouette_score(alt_distances, clustering.labels_, metric='precomputed')
        except ValueError:
            alt_score = None

    res = affinity_clustering_result(outpath=outpath,
                                     damping=damping,
                                     max_iter=max_iter,
                                     convergence_iter=convergence_iter,
                                     preference_quantile=preference_quantile,
                                     silhouette_score=score,
                                     alt_silhouette_score=alt_score,
                                     name=str(name))

    return res
def do_affinity_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter,  outdir:Path, random_state, verbose, alt_mat, overwrite=False):
    """Fit one affinity-propagation clustering and score it.

    Returns an ``affinity_clustering_result`` holding the parameters, the
    output path and the silhouette scores.  A score is ``None`` when
    ``silhouette_score`` raises ``ValueError`` or, for the alternative score,
    when ``alt_mat`` is ``None``.

    NOTE(review): an earlier function with this same name exists in the file
    and is shadowed by this one; the two call ``_affinity_clustering`` with
    different argument lists -- confirm which is correct.
    """
    if name is None:
        name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}"
    print(name)
    sys.stdout.flush()
    outpath = outdir / (str(name) + ".feather")
    outpath.parent.mkdir(parents=True,exist_ok=True)
    print(outpath)
    clustering = _affinity_clustering(mat, subreddits, outpath, damping, max_iter, convergence_iter, preference_quantile, random_state, verbose)
    mat = sim_to_dist(clustering.affinity_matrix_)

    try:
        score = silhouette_score(mat, clustering.labels_, metric='precomputed')
    except ValueError:
        score = None

    alt_score = None
    if alt_mat is not None:
        alt_distances = sim_to_dist(alt_mat)
        try:
            # metric='precomputed' expects distances, so score the converted
            # matrix rather than the raw similarities
            alt_score = silhouette_score(alt_distances, clustering.labels_, metric='precomputed')
        except ValueError:
            alt_score = None

    # damping / convergence_iter / preference_quantile are fields of
    # affinity_clustering_result, not of the clustering_result base class
    res = affinity_clustering_result(outpath=outpath,
                                     damping=damping,
                                     max_iter=max_iter,
                                     convergence_iter=convergence_iter,
                                     preference_quantile=preference_quantile,
                                     silhouette_score=score,
                                     alt_silhouette_score=alt_score,
                                     name=str(name))

    return res
# alt similarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering).
def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max_iter=100000, convergence_iter=[30], preference_quantile=[0.5], random_state=1968, verbose=True, alt_similarities=None, J=None):
    """Fit affinity propagation over a hyperparameter grid and record the results.

    Every combination of ``damping``, ``convergence_iter`` and
    ``preference_quantile`` is clustered in a pool of ``J`` processes
    (``cpu_count()`` when ``J`` is None).  The per-run results are written to
    ``outinfo`` as csv and returned as a DataFrame.
    """
    damping = list(map(float,damping))
    convergence_iter = list(map(int,convergence_iter))
    preference_quantile = list(map(float,preference_quantile))

    outdir = Path(outdir)
    outdir.mkdir(parents=True,exist_ok=True)

    subreddits, mat = read_similarity_mat(similarities,use_threads=True)

    if alt_similarities is not None:
        # read_similarity_mat returns (subreddits, matrix); only the matrix is wanted
        _, alt_mat = read_similarity_mat(alt_similarities,use_threads=True)
    else:
        alt_mat = None

    if J is None:
        J = cpu_count()

    # get list of tuples: the combinations of hyperparameters
    hyper_grid = product(damping, convergence_iter, preference_quantile)
    # the run index doubles as the run name (4th positional argument)
    hyper_grid = (t + (str(i),) for i, t in enumerate(hyper_grid))

    _do_clustering = partial(do_affinity_clustering,  mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat)

    print("running clustering selection")
    # the context manager releases the worker processes once the map is done
    with Pool(J) as pool:
        clustering_data = pool.starmap(_do_clustering, hyper_grid)

    clustering_data = pd.DataFrame(list(clustering_data))
    clustering_data.to_csv(outinfo)

    return clustering_data
if __name__ == "__main__":
x = fire.Fire(select_affinity_clustering)

View File

@@ -1,92 +0,0 @@
from sklearn.metrics import silhouette_score
from sklearn.cluster import AffinityPropagation
from functools import partial
from clustering import _kmeans_clustering, read_similarity_mat, sim_to_dist, process_clustering_result, clustering_result
from dataclasses import dataclass
from multiprocessing import Pool, cpu_count, Array, Process
from pathlib import Path
from itertools import product, starmap
import numpy as np
import pandas as pd
import fire
import sys
# Result record for one k-means run: extends clustering_result (imported
# above) with the k-means settings declared below.
@dataclass
class kmeans_clustering_result(clustering_result):
    n_clusters:int  # n_clusters setting of the run
    n_init:int  # n_init setting of the run
# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.
def do_clustering(n_clusters, n_init, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False):
    """Fit one k-means clustering, write its cluster table and score it.

    Returns a ``kmeans_clustering_result`` holding the parameters, the output
    path and the silhouette scores.  A score is ``None`` when
    ``silhouette_score`` raises ``ValueError`` or, for the alternative score,
    when ``alt_mat`` is ``None``.
    """
    if name is None:
        # the previous default referenced affinity-propagation variables that
        # do not exist in this function
        name = f"n_clusters-{n_clusters}_n_init-{n_init}"
    print(name)
    sys.stdout.flush()
    outpath = outdir / (str(name) + ".feather")
    print(outpath)
    mat = sim_to_dist(mat)
    clustering = _kmeans_clustering(mat, outpath, n_clusters, n_init, max_iter, random_state, verbose)

    # build the cluster table before writing it (it used to be written first)
    cluster_data = process_clustering_result(clustering, subreddits)
    outpath.parent.mkdir(parents=True,exist_ok=True)
    cluster_data.to_feather(outpath)

    try:
        score = silhouette_score(mat, clustering.labels_, metric='precomputed')
    except ValueError:
        score = None

    alt_score = None
    if alt_mat is not None:
        alt_distances = sim_to_dist(alt_mat)
        try:
            # metric='precomputed' expects distances, so score the converted
            # matrix rather than the raw similarities
            alt_score = silhouette_score(alt_distances, clustering.labels_, metric='precomputed')
        except ValueError:
            alt_score = None

    res = kmeans_clustering_result(outpath=outpath,
                                   max_iter=max_iter,
                                   n_clusters=n_clusters,
                                   n_init = n_init,
                                   silhouette_score=score,
                                   alt_silhouette_score=alt_score,
                                   name=str(name))

    return res
# alt similarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering).
def select_kmeans_clustering(similarities, outdir, outinfo, n_clusters=[1000], max_iter=100000, n_init=10, random_state=1968, verbose=True, alt_similarities=None):
    """Fit k-means over a grid of ``n_clusters`` x ``n_init`` and record the results.

    ``n_clusters`` and ``n_init`` may each be a single value or an iterable
    of values.  The per-run results are written to ``outinfo`` as csv and
    returned as a DataFrame.
    """

    def _as_int_list(value):
        # the n_init default (10) and single command-line values are scalars,
        # which map(int, ...) cannot iterate
        if isinstance(value, (str, bytes)) or not hasattr(value, '__iter__'):
            value = [value]
        return [int(v) for v in value]

    n_clusters = _as_int_list(n_clusters)
    n_init = _as_int_list(n_init)

    outdir = Path(outdir)
    outdir.mkdir(parents=True,exist_ok=True)

    subreddits, mat = read_similarity_mat(similarities,use_threads=True)

    if alt_similarities is not None:
        # read_similarity_mat returns (subreddits, matrix); only the matrix is wanted
        _, alt_mat = read_similarity_mat(alt_similarities,use_threads=True)
    else:
        alt_mat = None

    # get list of tuples: the combinations of hyperparameters
    hyper_grid = product(n_clusters, n_init)
    # the run index doubles as the run name (3rd positional argument)
    hyper_grid = (t + (str(i),) for i, t in enumerate(hyper_grid))

    _do_clustering = partial(do_clustering,  mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat)

    # call starmap
    print("running clustering selection")
    clustering_data = starmap(_do_clustering, hyper_grid)
    clustering_data = pd.DataFrame(list(clustering_data))
    clustering_data.to_csv(outinfo)

    return clustering_data
if __name__ == "__main__":
x = fire.Fire(select_kmeans_clustering)

View File

@@ -1,7 +1,38 @@
import fire import pandas as pd
from select_affinity import select_affinity_clustering import plotnine as pn
from select_kmeans import select_kmeans_clustering from pathlib import Path
from clustering.fit_tsne import fit_tsne
from visualization.tsne_vis import build_visualization
# Selection data written by the hdbscan clustering sweep.
df = pd.read_csv("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv",index_col=0)

# plot silhouette_score as a function of isolates
df = df.sort_values("silhouette_score")

# n_isolates is stored as text; the count is the integer that follows "\n0".
df['n_isolates'] = df.n_isolates.str.split("\n0").apply(lambda rg: int(rg[1]))
p = pn.ggplot(df,pn.aes(x='n_isolates',y='silhouette_score')) + pn.geom_point()
p.save("isolates_x_score.png")

p = pn.ggplot(df,pn.aes(y='n_clusters',x='n_isolates',color='silhouette_score')) + pn.geom_point()
p.save("clusters_x_isolates.png")

# Candidate "best" hdbscan results: fewer than 5000 isolates, silhouette
# score above 0.4 and min_cluster_size == 2, one per cluster selection method.
# i think I prefer the 'eom' clustering style because larger clusters are less likely to suffer from omitted variables
# NOTE(review): .iloc[df.shape[1]] indexes the filtered rows by the number of
# *columns* of df; confirm that this is the intended row.
best_eom = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='eom')&(df.min_cluster_size==2)].iloc[df.shape[1]]

# Bound as best_leaf (was best_lsi) so it matches the name used below.
best_leaf = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='leaf')&(df.min_cluster_size==2)].iloc[df.shape[1]]

# Fit the t-SNE layout only when it has not been cached on disk yet.
tsne_data = Path("./clustering/authors-tf_lsi850_tsne.feather")

if not tsne_data.exists():
    fit_tsne("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather",
             tsne_data)

build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
                    Path(best_eom.outpath)/(best_eom['name']+'.feather'),
                    "./authors-tf_lsi850_best_eom.html")

build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
                    Path(best_leaf.outpath)/(best_leaf['name']+'.feather'),
                    "./authors-tf_lsi850_best_leaf.html")
if __name__ == "__main__":
fire.Fire({"kmeans":select_kmeans_clustering,
"affinity":select_affinity_clustering})

4
clustering/validation.py Normal file
View File

@@ -0,0 +1,4 @@
from sklearn import metrics
from sklearn.cluster import AffinityPropagation
from functools import partial
# silhouette is the only one that doesn't need the feature matrix, so it's probably the only one that's worth trying.

28
datasets/Makefile Normal file
View File

@@ -0,0 +1,28 @@
# Build both final datasets: comments and submissions, each partitioned by subreddit.
all: ../../data/reddit_comments_by_subreddit.parquet ../../data/reddit_submissions_by_subreddit.parquet
# Comments, stage 2: run comments_2_parquet_part2.py through start_spark_and_run.sh
# once the temporary parquet from stage 1 exists.
../../data/reddit_comments_by_subreddit.parquet:../../data/temp/reddit_comments.parquet
../start_spark_and_run.sh 4 comments_2_parquet_part2.py
# Comments, stage 1: submit a Slurm array with one task per line of
# comments_task_list.sh and block (--wait) until it finishes. The trailing 0
# is the offset that run_comments_jobs.sbatch adds to SLURM_ARRAY_TASK_ID.
../../data/temp/reddit_comments.parquet: comments_task_list.sh run_comments_jobs.sbatch
mkdir -p comments_jobs
mkdir -p ../../data/temp/
sbatch --wait --array=1-$(shell cat comments_task_list.sh | wc -l) run_comments_jobs.sbatch 0
# Alias for the stage-1 comments output; it has no recipe of its own.
temp_reddit_comments.parquet: ../../data/temp/reddit_comments.parquet
# Regenerate the comments task list whenever the stage-1 script changes.
comments_task_list.sh: comments_2_parquet_part1.py
srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 bash -c "source ~/.bashrc && python3 comments_2_parquet_part1.py gen_task_list --overwrite=False"
# Regenerate the submissions task list whenever the stage-1 script changes.
submissions_task_list.sh: submissions_2_parquet_part1.py
srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 python3 submissions_2_parquet_part1.py gen_task_list
# Submissions, stage 2: run submissions_2_parquet_part2.py through start_spark_and_run.sh.
../../data/reddit_submissions_by_subreddit.parquet:../../data/temp/reddit_submissions.parquet
../start_spark_and_run.sh 4 submissions_2_parquet_part2.py
# Submissions, stage 1: unlike the comments rule, this removes the previous
# temporary output before submitting the array job.
../../data/temp/reddit_submissions.parquet: submissions_task_list.sh run_submissions_jobs.sbatch
mkdir -p submissions_jobs
rm -rf ../../data/temp/reddit_submissions.parquet
mkdir -p ../../data/temp/
sbatch --wait --array=1-$(shell cat submissions_task_list.sh | wc -l) run_submissions_jobs.sbatch 0
# Alias for the stage-1 submissions output; it has no recipe of its own.
temp_reddit_submissions.parquet: ../../data/temp/reddit_submissions.parquet

View File

@@ -1,26 +0,0 @@
#!/bin/bash
## parallel_sql_job.sh
#SBATCH --job-name=tf_subreddit_comments
## Allocation Definition
#SBATCH --account=comdata-ckpt
#SBATCH --partition=ckpt
## Resources
## Nodes. This should always be 1 for parallel-sql.
#SBATCH --nodes=1
## Walltime (12 hours)
#SBATCH --time=12:00:00
## Memory per node
#SBATCH --mem=32G
#SBATCH --cpus-per-task=4
#SBATCH --ntasks=1
#SBATCH -D /gscratch/comdata/users/nathante/cdsc-reddit
source ./bin/activate
module load parallel_sql
echo $(which perl)
conda list pyarrow
which python3
#Put here commands to load other modules (e.g. matlab etc.)
#Below command means that parallel_sql will get tasks from the database
#and run them on the node (in parallel). So a 16 core node will have
#16 tasks running at one time.
parallel-sql --sql -a parallel --exit-on-term --jobs 4

View File

@@ -1,10 +1,10 @@
#!/usr/bin/env bash
## needs to be run by hand since i don't have a nice way of waiting on a parallel-sql job to complete ## needs to be run by hand since i don't have a nice way of waiting on a parallel-sql job to complete
#!/usr/bin/env bash
echo "#!/usr/bin/bash" > job_script.sh echo "#!/usr/bin/bash" > job_script.sh
#echo "source $(pwd)/../bin/activate" >> job_script.sh #echo "source $(pwd)/../bin/activate" >> job_script.sh
echo "python3 $(pwd)/comments_2_parquet_part1.py" >> job_script.sh echo "python3 $(pwd)/comments_2_parquet_part1.py" >> job_script.sh
srun -p comdata -A comdata --nodes=1 --mem=120G --time=48:00:00 --pty job_script.sh srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 --pty job_script.sh
start_spark_and_run.sh 1 $(pwd)/comments_2_parquet_part2.py start_spark_and_run.sh 1 $(pwd)/comments_2_parquet_part2.py

View File

@@ -1,12 +1,15 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os
import json import json
from datetime import datetime from datetime import datetime
from multiprocessing import Pool from multiprocessing import Pool
from itertools import islice from itertools import islice
from helper import find_dumps, open_fileset from helper import open_input_file, find_dumps
import pandas as pd import pandas as pd
import pyarrow as pa import pyarrow as pa
import pyarrow.parquet as pq import pyarrow.parquet as pq
from pathlib import Path
import fire
def parse_comment(comment, names= None): def parse_comment(comment, names= None):
if names is None: if names is None:
@@ -44,19 +47,14 @@ def parse_comment(comment, names= None):
return tuple(row) return tuple(row)
# conf = sc._conf.setAll([('spark.executor.memory', '20g'), ('spark.app.name', 'extract_reddit_timeline'), ('spark.executor.cores', '26'), ('spark.cores.max', '26'), ('spark.driver.memory','84g'),('spark.driver.maxResultSize','0'),('spark.local.dir','/gscratch/comdata/spark_tmp')]) # conf = sc._conf.setAll([('spark.executor.memory', '20g'), ('spark.app.name', 'extract_reddit_timeline'), ('spark.executor.cores', '26'), ('spark.cores.max', '26'), ('spark.driver.memory','84g'),('spark.driver.maxResultSize','0'),('spark.local.dir','../../data/spark_tmp')])
dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments/" def parse_dump(partition):
files = list(find_dumps(dumpdir, base_pattern="RC_20*")) dumpdir = f"../../data/reddit_dumps/comments/{partition}"
pool = Pool(28) stream = open_input_file(dumpdir)
rows = map(parse_comment, stream)
stream = open_fileset(files)
N = int(1e4)
rows = pool.imap_unordered(parse_comment, stream, chunksize=int(N/28))
schema = pa.schema([ schema = pa.schema([
pa.field('id', pa.string(), nullable=True), pa.field('id', pa.string(), nullable=True),
@@ -78,33 +76,16 @@ schema = pa.schema([
pa.field('error', pa.string(), nullable=True), pa.field('error', pa.string(), nullable=True),
]) ])
from pathlib import Path p = Path("../../data/temp/reddit_comments.parquet")
p = Path("/gscratch/comdata/output/reddit_comments.parquet_temp2") p.mkdir(exist_ok=True,parents=True)
if not p.is_dir(): N=10000
if p.exists(): with pq.ParquetWriter(f"../../data/temp/reddit_comments.parquet/{partition}.parquet",
p.unlink() schema=schema,
p.mkdir() compression='snappy',
flavor='spark') as writer:
else:
list(map(Path.unlink,p.glob('*')))
part_size = int(1e7)
part = 1
n_output = 0
writer = pq.ParquetWriter(f"/gscratch/comdata/output/reddit_comments.parquet_temp2/part_{part}.parquet",schema=schema,compression='snappy',flavor='spark')
while True: while True:
if n_output > part_size:
if part > 1:
writer.close()
part = part + 1
n_output = 0
writer = pq.ParquetWriter(f"/gscratch/comdata/output/reddit_comments.parquet_temp2/part_{part}.parquet",schema=schema,compression='snappy',flavor='spark')
n_output += N
chunk = islice(rows,N) chunk = islice(rows,N)
pddf = pd.DataFrame(chunk, columns=schema.names) pddf = pd.DataFrame(chunk, columns=schema.names)
table = pa.Table.from_pandas(pddf,schema=schema) table = pa.Table.from_pandas(pddf,schema=schema)
@@ -112,4 +93,19 @@ while True:
break break
writer.write_table(table) writer.write_table(table)
writer.close()
def gen_task_list(dumpdir="../../data/raw_data/reddit_dumps/comments", overwrite=True):
    """Write ``comments_task_list.sh`` with one ``parse_dump`` command per dump.

    Every file that ``find_dumps`` reports for the pattern ``RC_20*.*`` under
    *dumpdir* gets a line, unless its parquet output already exists and
    *overwrite* is not exactly ``True``.
    """
    dump_paths = list(find_dumps(dumpdir, base_pattern="RC_20*.*"))

    with open("comments_task_list.sh", 'w') as task_file:
        for dump_path in dump_paths:
            # The dump's file name doubles as the partition name.
            partition = os.path.basename(dump_path)
            parsed = Path(f"../../data/temp/reddit_comments.parquet/{partition}.parquet")

            # Skip dumps that are already parsed unless told to redo them.
            if parsed.exists() and overwrite is not True:
                continue

            task_file.write(f'python3 comments_2_parquet_part1.py parse_dump {partition}\n')


if __name__ == '__main__':
    # Expose both stages as python-fire subcommands.
    commands = {'parse_dump': parse_dump,
                'gen_task_list': gen_task_list}
    fire.Fire(commands)

View File

@@ -2,12 +2,19 @@
# spark script to make sorted, and partitioned parquet files # spark script to make sorted, and partitioned parquet files
import pyspark
from pyspark.sql import functions as f from pyspark.sql import functions as f
from pyspark.sql import SparkSession from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate() spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("/gscratch/comdata/output/reddit_comments.parquet_temp2",compression='snappy') conf = pyspark.SparkConf().setAppName("Reddit submissions to parquet")
conf = conf.set("spark.sql.shuffle.partitions",2400)
conf = conf.set('spark.sql.crossJoin.enabled',"true")
conf = conf.set('spark.debug.maxToStringFields',200)
sc = spark.sparkContext
df = spark.read.parquet("/gscratch/comdata/output/temp/reddit_comments.parquet",compression='snappy')
df = df.withColumn("subreddit_2", f.lower(f.col('subreddit'))) df = df.withColumn("subreddit_2", f.lower(f.col('subreddit')))
df = df.drop('subreddit') df = df.drop('subreddit')
@@ -18,12 +25,13 @@ df = df.withColumn("Month",f.month(f.col("CreatedAt")))
df = df.withColumn("Year",f.year(f.col("CreatedAt"))) df = df.withColumn("Year",f.year(f.col("CreatedAt")))
df = df.withColumn("Day",f.dayofmonth(f.col("CreatedAt"))) df = df.withColumn("Day",f.dayofmonth(f.col("CreatedAt")))
df = df.repartition('subreddit') # df = df.repartition(1200,'subreddit')
df2 = df.sort(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True) # df2 = df.sort(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True)
df2 = df2.sortWithinPartitions(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True) # df2 = df2.sortWithinPartitions(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True)
df2.write.parquet("/gscratch/comdata/users/nathante/reddit_comments_by_subreddit.parquet_new", mode='overwrite', compression='snappy') # df2.write.parquet("/gscratch/scrubbed/comdata/reddit_comments_by_subreddit.parquet", mode='overwrite', compression='snappy')
df = df.repartition('author') #df = spark.read.parquet("/gscratch/scrubbed/comdata/reddit_comments_by_subreddit.parquet")
df3 = df.sort(["author","CreatedAt","subreddit","link_id","parent_id","Year","Month","Day"],ascending=True) df = df.repartition(2400,'author','subreddit',"Year","Month","Day")
df3 = df3.sortWithinPartitions(["author","CreatedAt","subreddit","link_id","parent_id","Year","Month","Day"],ascending=True) df3 = df.sort(["author","subreddit","Year","Month","Day","CreatedAt","link_id","parent_id"],ascending=True)
df3.write.parquet("/gscratch/comdata/users/nathante/reddit_comments_by_author.parquet_new", mode='overwrite',compression='snappy') df3 = df3.sortWithinPartitions(["author","subreddit","Year","Month","Day","CreatedAt","link_id","parent_id"],ascending=True)
df3.write.parquet("/gscratch/scrubbed/comdata/reddit_comments_by_author.parquet", mode='overwrite',compression='snappy')

View File

@@ -24,8 +24,7 @@ def open_fileset(files):
for fh in files: for fh in files:
print(fh) print(fh)
lines = open_input_file(fh) lines = open_input_file(fh)
for line in lines: yield from lines
yield line
def open_input_file(input_filename): def open_input_file(input_filename):
if re.match(r'.*\.7z$', input_filename): if re.match(r'.*\.7z$', input_filename):
@@ -39,7 +38,7 @@ def open_input_file(input_filename):
elif re.match(r'.*\.xz', input_filename): elif re.match(r'.*\.xz', input_filename):
cmd = ["xzcat",'-dk', '-T 20',input_filename] cmd = ["xzcat",'-dk', '-T 20',input_filename]
elif re.match(r'.*\.zst',input_filename): elif re.match(r'.*\.zst',input_filename):
cmd = ['zstd','-dck', input_filename] cmd = ['/kloneusr/bin/zstd','-dck', input_filename, '--memory=2048MB --stdout']
elif re.match(r'.*\.gz',input_filename): elif re.match(r'.*\.gz',input_filename):
cmd = ['gzip','-dc', input_filename] cmd = ['gzip','-dc', input_filename]
try: try:

View File

@@ -1,4 +0,0 @@
#!/usr/bin/bash
start_spark_cluster.sh
spark-submit --master spark://$(hostname):18899 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/users/nathante/subreddit_term_similarity_weekly_5000.parquet --topN=5000
stop-all.sh

View File

@@ -0,0 +1,24 @@
#!/bin/bash
## Slurm array job: each task runs one line of comments_task_list.sh
## (one comment dump parsed per task).
## Usage: sbatch --array=1-N run_comments_jobs.sbatch OFFSET
#SBATCH --job-name="cdsc_reddit; parse comment dumps"
## Allocation Definition
#SBATCH --account=comdata
#SBATCH --partition=compute-bigmem
## Resources
## Nodes. Each array task is a single python process, so one node suffices.
#SBATCH --nodes=1
## Walltime (24 hours)
#SBATCH --time=24:00:00
## Memory per node
#SBATCH --mem=8G
#SBATCH --cpus-per-task=1
#SBATCH --ntasks=1
#SBATCH
#SBATCH --chdir /gscratch/comdata/users/nathante/partitioning_reddit/dataverse/cdsc_reddit/datasets
## stdout and stderr share one log file per array task
## (%A = array job id, %a = array task index).
#SBATCH --output=comments_jobs/%A_%a.out
#SBATCH --error=comments_jobs/%A_%a.out
. /opt/ohpc/admin/lmod/lmod/init/profile
source ~/.bashrc
## $1 is an offset added to the array index to select a task-list line.
TASK_NUM=$(( SLURM_ARRAY_TASK_ID + $1))
## Fetch line TASK_NUM of the task list and execute it as a command.
TASK_CALL=$(sed -n ${TASK_NUM}p ./comments_task_list.sh)
${TASK_CALL}

View File

@@ -0,0 +1,23 @@
#!/bin/bash
## Slurm array job: each task runs one line of submissions_task_list.sh
## (one submission dump parsed per task).
## Usage: sbatch --array=1-N run_submissions_jobs.sbatch OFFSET
#SBATCH --job-name="cdsc_reddit; parse submission dumps"
## Allocation Definition
#SBATCH --account=comdata-ckpt
#SBATCH --partition=ckpt
## Resources
## Nodes. Each array task is a single python process, so one node suffices.
#SBATCH --nodes=1
## Walltime (24 hours)
#SBATCH --time=24:00:00
## Memory per node
#SBATCH --mem=8G
#SBATCH --cpus-per-task=1
#SBATCH --ntasks=1
#SBATCH
#SBATCH --chdir /gscratch/comdata/users/nathante/cdsc_reddit/datasets
## stdout and stderr share one log file per array task
## (%A = array job id, %a = array task index).
#SBATCH --output=submissions_jobs/%A_%a.out
#SBATCH --error=submissions_jobs/%A_%a.out
## $1 is an offset added to the array index to select a task-list line.
TASK_NUM=$(( SLURM_ARRAY_TASK_ID + $1))
## Fetch line TASK_NUM of the task list and execute it as a command.
TASK_CALL=$(sed -n ${TASK_NUM}p ./submissions_task_list.sh)
${TASK_CALL}

4
datasets/submissions_2_parquet.sh Normal file → Executable file
View File

@@ -1,8 +1,8 @@
#!/usr/bin/env bash
## this should be run manually since we don't have a nice way to wait on parallel_sql jobs ## this should be run manually since we don't have a nice way to wait on parallel_sql jobs
#!/usr/bin/env bash
./parse_submissions.sh srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 python3 $(pwd)/submissions_2_parquet_part1.py gen_task_list
start_spark_and_run.sh 1 $(pwd)/submissions_2_parquet_part2.py start_spark_and_run.sh 1 $(pwd)/submissions_2_parquet_part2.py

View File

@@ -3,26 +3,23 @@
# two stages: # two stages:
# 1. from gz to arrow parquet (this script) # 1. from gz to arrow parquet (this script)
# 2. from arrow parquet to spark parquet (submissions_2_parquet_part2.py) # 2. from arrow parquet to spark parquet (submissions_2_parquet_part2.py)
from datetime import datetime from datetime import datetime
from multiprocessing import Pool from pathlib import Path
from itertools import islice from itertools import islice
from helper import find_dumps, open_fileset from helper import find_dumps, open_fileset
import pandas as pd import pandas as pd
import pyarrow as pa import pyarrow as pa
import pyarrow.parquet as pq import pyarrow.parquet as pq
import simdjson
import fire import fire
import os import os
import json
parser = simdjson.Parser()
def parse_submission(post, names = None): def parse_submission(post, names = None):
if names is None: if names is None:
names = ['id','author','subreddit','title','created_utc','permalink','url','domain','score','ups','downs','over_18','has_media','selftext','retrieved_on','num_comments','gilded','edited','time_edited','subreddit_type','subreddit_id','subreddit_subscribers','name','is_self','stickied','quarantine','error'] names = ['id','author','subreddit','title','created_utc','permalink','url','domain','score','ups','downs','over_18','has_media','selftext','retrieved_on','num_comments','gilded','edited','time_edited','subreddit_type','subreddit_id','subreddit_subscribers','name','is_self','stickied','quarantine','error']
try: try:
post = parser.parse(post) post = json.loads(post)
except (ValueError) as e: except (ValueError) as e:
# print(e) # print(e)
# print(post) # print(post)
@@ -61,7 +58,7 @@ def parse_submission(post, names = None):
def parse_dump(partition): def parse_dump(partition):
N=10000 N=10000
stream = open_fileset([f"/gscratch/comdata/raw_data/reddit_dumps/submissions/{partition}"]) stream = open_fileset([f"/gscratch/comdata/raw_data/submissions/{partition}"])
rows = map(parse_submission,stream) rows = map(parse_submission,stream)
schema = pa.schema([ schema = pa.schema([
pa.field('id', pa.string(),nullable=True), pa.field('id', pa.string(),nullable=True),
@@ -92,8 +89,7 @@ def parse_dump(partition):
pa.field('quarantine',pa.bool_(),nullable=True), pa.field('quarantine',pa.bool_(),nullable=True),
pa.field('error',pa.string(),nullable=True)]) pa.field('error',pa.string(),nullable=True)])
if not os.path.exists("/gscratch/comdata/output/temp/reddit_submissions.parquet/"): Path("/gscratch/comdata/output/temp/reddit_submissions.parquet/").mkdir(exist_ok=True,parents=True)
os.mkdir("/gscratch/comdata/output/temp/reddit_submissions.parquet/")
with pq.ParquetWriter(f"/gscratch/comdata/output/temp/reddit_submissions.parquet/{partition}",schema=schema,compression='snappy',flavor='spark') as writer: with pq.ParquetWriter(f"/gscratch/comdata/output/temp/reddit_submissions.parquet/{partition}",schema=schema,compression='snappy',flavor='spark') as writer:
while True: while True:
@@ -106,9 +102,9 @@ def parse_dump(partition):
writer.close() writer.close()
def gen_task_list(dumpdir="/gscratch/comdata/raw_data/reddit_dumps/submissions"): def gen_task_list(dumpdir="/gscratch/comdata/raw_data/submissions"):
files = list(find_dumps(dumpdir,base_pattern="RS_20*.*")) files = list(find_dumps(dumpdir,base_pattern="RS_20*.*"))
with open("parse_submissions_task_list",'w') as of: with open("submissions_task_list.sh",'w') as of:
for fpath in files: for fpath in files:
partition = os.path.split(fpath)[1] partition = os.path.split(fpath)[1]
of.write(f'python3 submissions_2_parquet_part1.py parse_dump {partition}\n') of.write(f'python3 submissions_2_parquet_part1.py parse_dump {partition}\n')

View File

@@ -29,14 +29,14 @@ df = df.withColumn("Day",f.dayofmonth(f.col("CreatedAt")))
df = df.withColumn("subreddit_hash",f.sha2(f.col("subreddit"), 256)[0:3]) df = df.withColumn("subreddit_hash",f.sha2(f.col("subreddit"), 256)[0:3])
# next we gotta resort it all. # next we gotta resort it all.
df = df.repartition("subreddit") df = df.repartition(800,"subreddit","Year","Month")
df2 = df.sort(["subreddit","CreatedAt","id"],ascending=True) df2 = df.sort(["subreddit","Year","Month","CreatedAt","id"],ascending=True)
df2 = df.sortWithinPartitions(["subreddit","CreatedAt","id"],ascending=True) df2 = df.sortWithinPartitions(["subreddit","CreatedAt","id"],ascending=True)
df2.write.parquet("/gscratch/comdata/output/temp/reddit_submissions_by_subreddit.parquet2", mode='overwrite',compression='snappy') df2.write.parquet("/gscratch/comdata/output/temp/reddit_submissions_by_subreddit.parquet2", mode='overwrite',compression='snappy')
# # we also want to have parquet files sorted by author then reddit. # # we also want to have parquet files sorted by author then reddit.
df = df.repartition("author") df = df.repartition(800,"author","subreddit","Year","Month")
df3 = df.sort(["author","CreatedAt","id"],ascending=True) df3 = df.sort(["author","Year","Month","CreatedAt","id"],ascending=True)
df3 = df.sortWithinPartitions(["author","CreatedAt","id"],ascending=True) df3 = df.sortWithinPartitions(["author","CreatedAt","id"],ascending=True)
df3.write.parquet("/gscratch/comdata/output/temp/reddit_submissions_by_author.parquet2", mode='overwrite',compression='snappy') df3.write.parquet("/gscratch/comdata/output/temp/reddit_submissions_by_author.parquet2", mode='overwrite',compression='snappy')

View File

@@ -1,10 +1,7 @@
all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscratch/comdata/output/reddit_density/comment_authors_10000.feather /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather all: ../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather
/gscratch/comdata/output/reddit_density/comment_terms_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather ../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather: overlap_density.py ../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather
start_spark_and_run.sh 1 overlap_density.py terms --inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather" --agg=pd.DataFrame.sum ../start_spark_and_run.sh 1 overlap_density.py authors --inpath="../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather" --outpath="../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather" --agg=pd.DataFrame.sum
/gscratch/comdata/output/reddit_density/comment_authors_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather ../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather:
start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather" --agg=pd.DataFrame.sum $(MAKE) -C ../similarities
/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather" --agg=pd.DataFrame.sum

View File

@@ -1,4 +1,6 @@
#!/usr/bin/bash #!/usr/bin/bash
source ~/.bashrc
echo $(hostname)
start_spark_cluster.sh start_spark_cluster.sh
spark-submit --master spark://$(hostname):18899 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --outpath=/gscratch/comdata/output/reddit_density/comment_authors_10000.feather --agg=pd.DataFrame.sum spark-submit --verbose --master spark://$(hostname):43015 overlap_density.py authors --inpath=../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather --outpath=../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather --agg=pd.DataFrame.sum
stop-all.sh stop-all.sh

View File

@@ -1,11 +1,12 @@
import pandas as pd import pandas as pd
from pandas.core.groupby import DataFrameGroupBy as GroupBy from pandas.core.groupby import DataFrameGroupBy as GroupBy
from pathlib import Path
import fire import fire
import numpy as np import numpy as np
import sys import sys
sys.path.append("..") # sys.path.append("..")
sys.path.append("../similarities") # sys.path.append("../similarities")
from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_interval # from similarities.similarities_helper import pull_tfidf
# this is the mean of the ratio of the overlap to the focal size. # this is the mean of the ratio of the overlap to the focal size.
# mean shared membership per focal community member # mean shared membership per focal community member
@@ -13,10 +14,12 @@ from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_i
def overlap_density(inpath, outpath, agg = pd.DataFrame.sum): def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
df = pd.read_feather(inpath) df = pd.read_feather(inpath)
df = df.drop('subreddit',1) df = df.drop('_subreddit',1)
np.fill_diagonal(df.values,0) np.fill_diagonal(df.values,0)
df = agg(df, 0).reset_index() df = agg(df, 0).reset_index()
df = df.rename({0:'overlap_density'},axis='columns') df = df.rename({0:'overlap_density'},axis='columns')
outpath = Path(outpath)
outpath.parent.mkdir(parents=True, exist_ok = True)
df.to_feather(outpath) df.to_feather(outpath)
return df return df
@@ -25,6 +28,8 @@ def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
# exclude the diagonal # exclude the diagonal
df = df.loc[df.subreddit != df.variable] df = df.loc[df.subreddit != df.variable]
res = agg(df.groupby(['subreddit','week'])).reset_index() res = agg(df.groupby(['subreddit','week'])).reset_index()
outpath = Path(outpath)
outpath.parent.mkdir(parents=True, exist_ok = True)
res.to_feather(outpath) res.to_feather(outpath)
return res return res

View File

@@ -6,9 +6,9 @@ from os import path
import hashlib import hashlib
shasums1 = requests.get("https://files.pushshift.io/reddit/comments/sha256sum.txt").text shasums1 = requests.get("https://files.pushshift.io/reddit/comments/sha256sum.txt").text
shasums2 = requests.get("https://files.pushshift.io/reddit/comments/daily/sha256sum.txt").text #shasums2 = requests.get("https://files.pushshift.io/reddit/comments/daily/sha256sum.txt").text
shasums = shasums1 + shasums2 shasums = shasums1
dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments" dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments"
for l in shasums.strip().split('\n'): for l in shasums.strip().split('\n'):

View File

@@ -1,14 +1,12 @@
#!/bin/bash #!/bin/bash
user_agent='nathante teblunthuis <nathante@uw.edu>' user_agent='"nathante teblunthuis <nathante@uw.edu>"'
output_dir='/gscratch/comdata/raw_data/reddit_dumps/comments' output_dir='/gscratch/comdata/raw_data/reddit_dumps/comments'
base_url='https://files.pushshift.io/reddit/comments/' base_url='https://files.pushshift.io/reddit/comments/'
wget -r --no-parent -A 'RC_201*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url wget -r --no-parent -A 'RC_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url
wget -r --no-parent -A 'RC_201*.xz' -U $user_agent -P $output_dir -nd -nc $base_url wget -r --no-parent -A 'RC_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url
wget -r --no-parent -A 'RC_201*.zst' -U $user_agent -P $output_dir -nd -nc $base_url wget -r --no-parent -A 'RC_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url
# starting in 2020 we use daily dumps not monthly dumps
wget -r --no-parent -A 'RC_202*.gz' -U $user_agent -P $output_dir -nd -nc $base_url/daily/
./check_comments_shas.py ./check_comments_shas.py

View File

@@ -1,14 +1,14 @@
#!/bin/bash #!/bin/bash
user_agent='nathante teblunthuis <nathante@uw.edu>' user_agent='"nathante teblunthuis <nathante@uw.edu>"'
output_dir='/gscratch/comdata/raw_data/reddit_dumps/submissions' output_dir='/gscratch/comdata/raw_data/reddit_dumps/submissions'
base_url='https://files.pushshift.io/reddit/submissions/' base_url='https://files.pushshift.io/reddit/submissions/'
wget -r --no-parent -A 'RS_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url wget -r --no-parent -A 'RS_20*.bz2' --user-agent=$user_agent -P $output_dir -nd -nc $base_url
wget -r --no-parent -A 'RS_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url wget -r --no-parent -A 'RS_20*.xz' --user-agent=$user_agent -P $output_dir -nd -nc $base_url
wget -r --no-parent -A 'RS_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url wget -r --no-parent -A 'RS_20*.zst' --user-agent=$user_agent -P $output_dir -nd -nc $base_url
wget -r --no-parent -A 'RS_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url/old_v1_data/ wget -r --no-parent -A 'RS_20*.bz2' --user-agent=$user_agent -P $output_dir -nd -nc $base_url/old_v1_data/
wget -r --no-parent -A 'RS_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url/old_v1_data/ wget -r --no-parent -A 'RS_20*.xz' --user-agent=$user_agent -P $output_dir -nd -nc $base_url/old_v1_data/
wget -r --no-parent -A 'RS_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url/old_v1_data/ wget -r --no-parent -A 'RS_20*.zst' --user-agent=$user_agent -P $output_dir -nd -nc $base_url/old_v1_data/
./check_submission_shas.py ./check_submission_shas.py

View File

@@ -0,0 +1,34 @@
from pathlib import Path
from itertools import chain, groupby
# De-duplicate comment dumps that exist in more than one compression format:
# keep the preferred format for each dump and move the rest to "to_remove".
dumpdir = Path("/gscratch/comdata/raw_data/reddit_dumps/comments")

zst_files = dumpdir.glob("*.zst")
bz2_files = dumpdir.glob("*.bz2")
xz_files = dumpdir.glob("*.xz")

# Preferred compression formats, best first.
priority = ['.zst','.xz','.bz2']

# groupby only merges *adjacent* items, so sort by the grouping key (the
# stem) first; the secondary key puts the preferred format first in each group.
all_files = sorted(chain(zst_files, bz2_files, xz_files),
                   key=lambda p: (p.stem, priority.index(p.suffix)))
groups = groupby(all_files, key = lambda p: p.stem)

kept_paths = []
removed_paths = []

for stem, files in groups:
    # The first file of each group has the best priority; the rest are duplicates.
    keep_file, *remove_files = files
    kept_paths.append(keep_file)
    removed_paths.extend(remove_files)

# exist_ok lets the script be re-run after a previous (partial) pass.
(dumpdir / "to_remove").mkdir(exist_ok=True)

for f in removed_paths:
    f.rename(f.parent / "to_remove" / f.name)

View File

@@ -0,0 +1,34 @@
from pathlib import Path
from itertools import chain, groupby
# De-duplicate submission dumps that exist in more than one compression format:
# keep the preferred format for each dump and move the rest to "to_remove".
dumpdir = Path("/gscratch/comdata/raw_data/reddit_dumps/submissions")

zst_files = dumpdir.glob("*.zst")
bz2_files = dumpdir.glob("*.bz2")
xz_files = dumpdir.glob("*.xz")

# Preferred compression formats, best first.
priority = ['.zst','.xz','.bz2']

# groupby only merges *adjacent* items, so sort by the grouping key (the
# stem) first; the secondary key puts the preferred format first in each group.
all_files = sorted(chain(zst_files, bz2_files, xz_files),
                   key=lambda p: (p.stem, priority.index(p.suffix)))
groups = groupby(all_files, key = lambda p: p.stem)

kept_paths = []
removed_paths = []

for stem, files in groups:
    # The first file of each group has the best priority; the rest are duplicates.
    keep_file, *remove_files = files
    kept_paths.append(keep_file)
    removed_paths.extend(remove_files)

# exist_ok lets the script be re-run after a previous (partial) pass.
(dumpdir / "to_remove").mkdir(exist_ok=True)

for f in removed_paths:
    f.rename(f.parent / "to_remove" / f.name)

View File

@@ -1,17 +0,0 @@
import pyarrow.dataset as ds
# A pyarrow dataset abstracts reading, writing, or filtering a parquet file. It does not read data into memory.
#dataset = ds.dataset(pathlib.Path('/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet/'), format='parquet', partitioning='hive')
dataset = ds.dataset('/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/', format='parquet')
# let's get all the comments to two subreddits:
subreddits_to_pull = ['seattle','seattlewa']
# a table is a low-level structured data format. This line pulls data into memory. Setting metadata_n_threads > 1 gives a little speed boost.
table = dataset.to_table(filter = ds.field('subreddit').isin(subreddits_to_pull), columns=['id','subreddit','CreatedAt','author','ups','downs','score','subreddit_id','stickied','title','url','is_self','selftext'])
# Since data from just these 2 subreddits fits in memory we can just turn our table into a pandas dataframe.
df = table.to_pandas()
# We should save this smaller dataset so we don't have to wait 15 min to pull from parquet next time.
df.to_csv("mydataset.csv")

View File

@@ -1,38 +0,0 @@
import pyarrow.dataset as ds
from itertools import groupby

# Example: stream rows for a couple of subreddits out of the author-ordered
# parquet dataset instead of loading everything into one dataframe.

# Opening a dataset only describes the parquet files; per the original author's
# note it does not read the data into memory.
dataset = ds.dataset(
    '/gscratch/comdata/output/reddit_submissions_by_author.parquet',
    format='parquet',
)

# The subreddits whose rows we want.
subreddits_to_pull = ['seattlewa','seattle']

# scan() hands back scan tasks that are executed lazily further down.
scan_tasks = dataset.scan(
    filter=ds.field('subreddit').isin(subreddits_to_pull),
    columns=[
        'id',
        'subreddit',
        'CreatedAt',
        'author',
        'ups',
        'downs',
        'score',
        'subreddit_id',
        'stickied',
        'title',
        'url',
        'is_self',
        'selftext',
    ],
)


def iterate_rows(scan_tasks):
    """Execute each scan task and yield its rows one at a time.

    Every record batch is converted to a pandas dataframe and its rows are
    yielded as the tuples produced by DataFrame.itertuples().
    """
    for task in scan_tasks:
        for batch in task.execute():
            yield from batch.to_pandas().itertuples()


row_iter = iterate_rows(scan_tasks)

# itertools.groupby merges only consecutive rows with the same author, so an
# author whose rows are not contiguous in the stream produces several groups.
author_submissions = groupby(row_iter, lambda row: row.author)

# Number of separate groups seen for each author.
count_dict = {}
for auth, _posts in author_submissions:
    count_dict[auth] = count_dict.get(auth, 0) + 1

# True if some author turned up in more than one group. The original author
# expected exactly one group per author because the dataset is partitioned and
# sorted by author.
any(n_groups != 1 for n_groups in count_dict.values())

25
ngrams/Makefile Normal file
View File

@@ -0,0 +1,25 @@
# Builds the reddit comment term/author frequency tables under ${outputdir}
# from the comments dataset at ${inputdir}.
outputdir=../../data/reddit_ngrams/
inputdir=../../data/reddit_comments_by_subreddit.parquet
# NOTE(review): authors_tfdir is not referenced by any rule visible in this Makefile.
authors_tfdir=${outputdir}/comment_authors.parquet
# Despite the name, this submits run_job.sbatch with sbatch; --wait makes the
# call block until the submitted job has finished.
srun=sbatch --wait --verbose run_job.sbatch
# Default goal: the _SUCCESS marker inside the sorted author table.
all: ${outputdir}/comment_authors_sorted.parquet/_SUCCESS
# Write the task list (one tf_comments.py command per input parquet file) to a
# file named after this target ($@ = tf_task_list_1).
tf_task_list_1: tf_comments.py
${srun} bash -c "python3 tf_comments.py gen_task_list --mwe_pass='first' --outputdir=${outputdir} --tf_task_list=$@ --inputdir=${inputdir}"
# Run the task list as a Slurm array job with one array index per line of the
# list ($< = tf_task_list_1); the leading 0 is the task-number offset passed to
# run_array.sbatch.
${outputdir}/comment_terms.parquet:tf_task_list_1
mkdir -p sbatch_log
sbatch --wait --verbose --array=1-$(shell cat $< | wc -l) run_array.sbatch 0 $<
# NOTE(review): the recipe below is a lone "-". Presumably a placeholder because
# the array job above also produces comment_authors.parquet -- confirm.
${outputdir}/comment_authors.parquet:${outputdir}/comment_terms.parquet
-
# Sort the author table by the author column with Spark.
# NOTE(review): the meaning of the "3" passed to start_spark_and_run.sh is not
# visible here -- confirm against that script.
${outputdir}/comment_authors_sorted.parquet:${outputdir}/comment_authors.parquet sort_tf_comments.py
../start_spark_and_run.sh 3 sort_tf_comments.py --inparquet=$< --outparquet=$@ --colname=author
# No recipe: the marker is expected to exist once the sorted table has been
# written (presumably created by the Spark writer -- confirm).
${outputdir}/comment_authors_sorted.parquet/_SUCCESS:${outputdir}/comment_authors_sorted.parquet
# The input dataset is produced by the Makefile in ../datasets.
${inputdir}:
$(MAKE) -C ../datasets

19
ngrams/run_array.sbatch Executable file
View File

@@ -0,0 +1,19 @@
#!/bin/bash
# Slurm array-job wrapper: each array task runs one line of a task-list file.
#
# Usage: sbatch --array=<range> run_array.sbatch <offset> <task_list>
#   $1  integer added to SLURM_ARRAY_TASK_ID to pick the line number
#   $2  path of the file that holds one command per line
#SBATCH --job-name=reddit_comment_term_frequencies
#SBATCH --account=comdata
#SBATCH --partition=compute-bigmem
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=9g
#SBATCH --ntasks=1
#SBATCH --export=ALL
#SBATCH --time=48:00:00
#SBATCH --chdir=/gscratch/comdata/users/nathante/partitioning_reddit/dataverse/cdsc_reddit/ngrams
# stderr and stdout are sent to the same file (both paths below are identical).
#SBATCH --error="sbatch_log/%A_%a.out"
#SBATCH --output="sbatch_log/%A_%a.out"
# Line number (as counted by sed) of the command this array task should run.
TASK_NUM=$(($SLURM_ARRAY_TASK_ID + $1))
# Extract just that line from the task list.
TASK_CALL=$(sed -n ${TASK_NUM}p $2)
# Left unquoted, so the shell word-splits the line into a command and its
# arguments; quote characters inside the line are not re-interpreted.
${TASK_CALL}

18
ngrams/run_job.sbatch Normal file
View File

@@ -0,0 +1,18 @@
#!/bin/bash
# Generic single-job Slurm wrapper: runs whatever command line is passed as
# arguments to this script.
# NOTE(review): the job name below does not mention ngrams; it looks carried
# over from another project -- confirm.
#SBATCH --job-name="simulate measurement error models"
## Allocation Definition
#SBATCH --account=comdata
#SBATCH --partition=compute-bigmem
## Resources
#SBATCH --nodes=1
## Walltime (4 hours)
#SBATCH --time=4:00:00
## Memory per node
#SBATCH --mem=4G
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=1
#SBATCH --chdir /gscratch/comdata/users/nathante/partitioning_reddit/dataverse/cdsc_reddit/ngrams/
## stdout and stderr go to separate files in sbatch_log/.
## NOTE(review): %a is the array-index pattern, but nothing in this script
## requests a job array -- confirm the resulting file names are as intended.
#SBATCH --output=sbatch_log/%A_%a.out
#SBATCH --error=sbatch_log/%A_%a.err
# Log the command line, then execute it with its arguments preserved as given.
echo "$@"
"$@"

View File

@@ -1,8 +1,6 @@
#!/usr/bin/env bash #!/usr/bin/env bash
module load parallel_sql
source ./bin/activate source ./bin/activate
python3 tf_comments.py gen_task_list python3 tf_comments.py gen_task_list
psu --del --Y
cat tf_task_list | psu --load
for job in $(seq 1 50); do sbatch checkpoint_parallelsql.sbatch; done; for job in $(seq 1 50); do sbatch checkpoint_parallelsql.sbatch; done;

View File

@@ -2,12 +2,17 @@
from pyspark.sql import functions as f from pyspark.sql import functions as f
from pyspark.sql import SparkSession from pyspark.sql import SparkSession
import fire
def main(inparquet, outparquet, colname):
spark = SparkSession.builder.getOrCreate() spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test.parquet_temp/") df = spark.read.parquet(inparquet)
df = df.repartition(2000,'term') df = df.repartition(2000,colname)
df = df.sort(['term','week','subreddit']) df = df.sort([colname,'week','subreddit'])
df = df.sortWithinPartitions(['term','week','subreddit']) df = df.sortWithinPartitions([colname,'week','subreddit'])
df.write.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test_sorted_tf.parquet_temp",mode='overwrite',compression='snappy') df.write.parquet(outparquet,mode='overwrite',compression='snappy')
if __name__ == '__main__':
fire.Fire(main)

View File

@@ -3,6 +3,7 @@ import pandas as pd
import pyarrow as pa import pyarrow as pa
import pyarrow.dataset as ds import pyarrow.dataset as ds
import pyarrow.parquet as pq import pyarrow.parquet as pq
import pyarrow.compute as pc
from itertools import groupby, islice, chain from itertools import groupby, islice, chain
import fire import fire
from collections import Counter from collections import Counter
@@ -13,26 +14,33 @@ from nltk.corpus import stopwords
from nltk.util import ngrams from nltk.util import ngrams
import string import string
from random import random from random import random
from redditcleaner import clean
# remove urls from pathlib import Path
# taken from https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url from datetime import datetime
urlregex = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")
# compute term frequencies for comments in each subreddit by week # compute term frequencies for comments in each subreddit by week
def weekly_tf(partition, mwe_pass = 'first'): def weekly_tf(partition, outputdir = '/gscratch/comdata/output/reddit_ngrams/', inputdir="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/", mwe_pass = 'first', excluded_users=None):
dataset = ds.dataset(f'/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/{partition}', format='parquet')
if not os.path.exists("/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/"):
os.mkdir("/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/")
if not os.path.exists("/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp/"): dataset = ds.dataset(Path(inputdir)/partition, format='parquet')
os.mkdir("/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp/") outputdir = Path(outputdir)
samppath = outputdir / "reddit_comment_ngrams_10p_sample"
if not samppath.exists():
samppath.mkdir(parents=True, exist_ok=True)
ngram_output = partition.replace("parquet","txt") ngram_output = partition.replace("parquet","txt")
if mwe_pass == 'first': if excluded_users is not None:
if os.path.exists(f"/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}"): excluded_users = set(map(str.strip,open(excluded_users)))
os.remove(f"/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}") df = df.filter(~ (f.col("author").isin(excluded_users)))
ngram_path = samppath / ngram_output
if mwe_pass == 'first':
if ngram_path.exists():
ngram_path.unlink()
dataset = dataset.filter(pc.field("CreatedAt") <= pa.scalar(datetime(2020,4,13)))
batches = dataset.to_batches(columns=['CreatedAt','subreddit','body','author']) batches = dataset.to_batches(columns=['CreatedAt','subreddit','body','author'])
@@ -65,8 +73,10 @@ def weekly_tf(partition, mwe_pass = 'first'):
subreddit_weeks = groupby(rows, lambda r: (r.subreddit, r.week)) subreddit_weeks = groupby(rows, lambda r: (r.subreddit, r.week))
mwe_path = outputdir / "multiword_expressions.feather"
if mwe_pass != 'first': if mwe_pass != 'first':
mwe_dataset = pd.read_feather(f'/gscratch/comdata/output/reddit_ngrams/multiword_expressions.feather') mwe_dataset = pd.read_feather(mwe_path)
mwe_dataset = mwe_dataset.sort_values(['phrasePWMI'],ascending=False) mwe_dataset = mwe_dataset.sort_values(['phrasePWMI'],ascending=False)
mwe_phrases = list(mwe_dataset.phrase) mwe_phrases = list(mwe_dataset.phrase)
mwe_phrases = [tuple(s.split(' ')) for s in mwe_phrases] mwe_phrases = [tuple(s.split(' ')) for s in mwe_phrases]
@@ -95,8 +105,8 @@ def weekly_tf(partition, mwe_pass = 'first'):
# lowercase # lowercase
text = text.lower() text = text.lower()
# remove urls # redditcleaner removes reddit markdown(newlines, quotes, bullet points, links, strikethrough, spoiler, code, superscript, table, headings)
text = urlregex.sub("", text) text = clean(text)
# sentence tokenize # sentence tokenize
sentences = sent_tokenize(text) sentences = sent_tokenize(text)
@@ -107,19 +117,18 @@ def weekly_tf(partition, mwe_pass = 'first'):
# remove punctuation # remove punctuation
sentences = map(remove_punct, sentences) sentences = map(remove_punct, sentences)
# remove sentences with less than 2 words
sentences = filter(lambda sentence: len(sentence) > 2, sentences)
# datta et al. select relatively common phrases from the reddit corpus, but they don't really explain how. We'll try that in a second phase. # datta et al. select relatively common phrases from the reddit corpus, but they don't really explain how. We'll try that in a second phase.
# they say that the extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms # they say that the extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
# here we take a 10 percent sample of sentences # here we take a 10 percent sample of sentences
if mwe_pass == 'first': if mwe_pass == 'first':
# remove sentences with less than 2 words
sentences = filter(lambda sentence: len(sentence) > 2, sentences)
sentences = list(sentences) sentences = list(sentences)
for sentence in sentences: for sentence in sentences:
if random() <= 0.1: if random() <= 0.1:
grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4)))) grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file: with open(ngram_path,'a') as gram_file:
for ng in grams: for ng in grams:
gram_file.write(' '.join(ng) + '\n') gram_file.write(' '.join(ng) + '\n')
for token in sentence: for token in sentence:
@@ -154,7 +163,14 @@ def weekly_tf(partition, mwe_pass = 'first'):
outchunksize = 10000 outchunksize = 10000
with pq.ParquetWriter(f"/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet/{partition}",schema=schema,compression='snappy',flavor='spark') as writer, pq.ParquetWriter(f"/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet/{partition}",schema=author_schema,compression='snappy',flavor='spark') as author_writer: termtf_outputdir = (outputdir / "comment_terms.parquet")
termtf_outputdir.mkdir(parents=True, exist_ok=True)
authortf_outputdir = (outputdir / "comment_authors.parquet")
authortf_outputdir.mkdir(parents=True, exist_ok=True)
termtf_path = termtf_outputdir / partition
authortf_path = authortf_outputdir / partition
with pq.ParquetWriter(termtf_path, schema=schema, compression='snappy', flavor='spark') as writer, \
pq.ParquetWriter(authortf_path, schema=author_schema, compression='snappy', flavor='spark') as author_writer:
while True: while True:
@@ -183,12 +199,12 @@ def weekly_tf(partition, mwe_pass = 'first'):
author_writer.close() author_writer.close()
def gen_task_list(mwe_pass='first'): def gen_task_list(mwe_pass='first', inputdir="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/", outputdir='/gscratch/comdata/output/reddit_ngrams/', tf_task_list='tf_task_list', excluded_users_file=None):
files = os.listdir("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/") files = os.listdir(inputdir)
with open("tf_task_list",'w') as outfile: with open(tf_task_list,'w') as outfile:
for f in files: for f in files:
if f.endswith(".parquet"): if f.endswith(".parquet"):
outfile.write(f"./tf_comments.py weekly_tf --mwe-pass {mwe_pass} {f}\n") outfile.write(f"./tf_comments.py weekly_tf --mwe-pass {mwe_pass} --inputdir {inputdir} --outputdir {outputdir} --excluded_users {excluded_users_file} {f}\n")
if __name__ == "__main__": if __name__ == "__main__":
fire.Fire({"gen_task_list":gen_task_list, fire.Fire({"gen_task_list":gen_task_list,

View File

@@ -1,58 +0,0 @@
# Score candidate multiword expressions from the sampled comment n-grams.
#
# For every phrase in the n-gram sample this computes
#   phrasePWMI = log P(phrase) - sum over its terms of log P(term)
# writes the full scored table to parquet and CSV, and then saves the phrases
# that pass the count and PWMI thresholds as feather and CSV.
from pyspark.sql import functions as f
from pyspark.sql import Window
from pyspark.sql import SparkSession
import numpy as np

spark = SparkSession.builder.getOrCreate()
# one n-gram per line of text
df = spark.read.text("/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/")

df = df.withColumnRenamed("value","phrase")

# count phrase occurrences, dropping phrases seen 10 times or fewer
phrases = df.groupby('phrase').count()
phrases = phrases.withColumnRenamed('count','phraseCount')
phrases = phrases.filter(phrases.phraseCount > 10)

# total number of occurrences across the remaining phrases
N = phrases.select(f.sum(phrases.phraseCount).alias("phraseCount")).collect()[0].phraseCount

print(f'analyzing PMI on a sample of {N} phrases')
logN = np.log(N)
phrases = phrases.withColumn("phraseLogProb", f.log(f.col("phraseCount")) - logN)

# count term occurrences: one row per (phrase, term) pair
phrases = phrases.withColumn('terms',f.split(f.col('phrase'),' '))
terms = phrases.select(['phrase','phraseCount','phraseLogProb',f.explode(phrases.terms).alias('term')])

# termCount = summed phraseCount over all rows that share the term
win = Window.partitionBy('term')
terms = terms.withColumn('termCount',f.sum('phraseCount').over(win))
# (the original renamed a 'count' column here, but no such column exists at
# this point, so that call did nothing and has been dropped)
terms = terms.withColumn('termLogProb',f.log(f.col('termCount')) - logN)

# collapse back to one row per phrase, summing the log-probabilities of its terms
terms = terms.groupBy(terms.phrase, terms.phraseLogProb, terms.phraseCount).sum('termLogProb')
terms = terms.withColumnRenamed('sum(termLogProb)','termsLogProb')
terms = terms.withColumn("phrasePWMI", f.col('phraseLogProb') - f.col('termsLogProb'))

# keep the phrase-level columns and order by PWMI, highest first
df = terms.select(['phrase','phraseCount','phraseLogProb','phrasePWMI'])

# sort() and sortWithinPartitions() take `ascending`; the `descending=True`
# keyword used previously is ignored, which left the output sorted ascending.
df = df.sort(['phrasePWMI'],ascending=False)
df = df.sortWithinPartitions(['phrasePWMI'],ascending=False)
df.write.parquet("/gscratch/comdata/users/nathante/reddit_comment_ngrams_pwmi.parquet/",mode='overwrite',compression='snappy')

# re-read the saved table and export it as CSV
df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_comment_ngrams_pwmi.parquet/")

df.write.csv("/gscratch/comdata/users/nathante/reddit_comment_ngrams_pwmi.csv/",mode='overwrite',compression='none')

df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_comment_ngrams_pwmi.parquet")

df = df.select('phrase','phraseCount','phraseLogProb','phrasePWMI')

# choosing phrases occurring more than 3500 times in the 10% sample (about 35000 times overall)
# and with a PWMI above 3 yields about 65000 expressions.
df = df.filter(f.col('phraseCount') > 3500).filter(f.col("phrasePWMI")>3)
# small enough after filtering to collect onto the driver
df = df.toPandas()
df.to_feather("/gscratch/comdata/users/nathante/reddit_multiword_expressions.feather")
df.to_csv("/gscratch/comdata/users/nathante/reddit_multiword_expressions.csv")

22
run_array.sbatch Normal file
View File

@@ -0,0 +1,22 @@
#!/bin/bash
## tf reddit comments
## Slurm array-job wrapper: each array task runs one line of ./var_jobs.sh.
## Usage: sbatch --array=<range> run_array.sbatch <offset>
##   $1  integer added to SLURM_ARRAY_TASK_ID to pick the line number
## NOTE(review): the job name, working directory and task file below refer to a
## "wikia ecology" project rather than reddit term frequencies -- confirm this
## script is configured for the intended project.
#SBATCH --job-name="wikia ecology; fit var models"
## Allocation Definition
#SBATCH --account=comdata-ckpt
#SBATCH --partition=ckpt
## Resources
## Nodes. This should always be 1 for parallel-sql.
#SBATCH --nodes=1
## Walltime (24 hours)
#SBATCH --time=24:00:00
## Memory per node
#SBATCH --mem=8G
#SBATCH --cpus-per-task=1
#SBATCH --ntasks=1
## NOTE(review): the next directive line carries no option.
#SBATCH
#SBATCH --chdir /gscratch/comdata/users/nathante/wikia_ecology
## stdout and stderr are sent to the same file (both paths below are identical).
#SBATCH --output=var_jobs/%A_%a.out
#SBATCH --error=var_jobs/%A_%a.out
# Line number (as counted by sed) of the command this array task should run.
TASK_NUM=$(( SLURM_ARRAY_TASK_ID + $1))
# Extract just that line from the task file, resolved relative to the --chdir directory.
TASK_CALL=$(sed -n ${TASK_NUM}p ./var_jobs.sh)
# Left unquoted, so the shell word-splits the line into a command and its
# arguments; quote characters inside the line are not re-interpreted.
${TASK_CALL}

View File

@@ -1,130 +1,28 @@
#all: /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_130k.parquet srun=srun -p compute-bigmem -A comdata --mem-per-cpu=9g --time=200:00:00 -c 40
srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh srun_huge=srun -p compute-hugemem -A comdata --mem=724g --time=200:00:00 -c 40
srun_singularity_huge=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity_huge.sh
base_data=/gscratch/comdata/output/ similarity_data=../../data/reddit_similarity
similarity_data=${base_data}/reddit_similarity
tfidf_data=${similarity_data}/tfidf tfidf_data=${similarity_data}/tfidf
tfidf_weekly_data=${similarity_data}/tfidf_weekly lsi_components=[10,50,100,200,300,400,500,600,700,850]
similarity_weekly_data=${similarity_data}/weekly
lsi_components=[10,50,100,200,300,400,500,600,700,850,1000,1500]
lsi_similarities: ${similarity_data}/subreddit_comment_terms_10k_LSI ${similarity_data}/subreddit_comment_authors-tf_10k_LSI ${similarity_data}/subreddit_comment_authors_10k_LSI ${similarity_data}/subreddit_comment_terms_30k_LSI ${similarity_data}/subreddit_comment_authors-tf_30k_LSI ${similarity_data}/subreddit_comment_authors_30k_LSI lsi_similarities: ${similarity_data}/subreddit_comment_authors-tf_10k_LSI
all: ${tfidf_data}/comment_terms_100k.parquet ${tfidf_data}/comment_terms_30k.parquet ${tfidf_data}/comment_terms_10k.parquet ${tfidf_data}/comment_authors_100k.parquet ${tfidf_data}/comment_authors_30k.parquet ${tfidf_data}/comment_authors_10k.parquet ${similarity_data}/subreddit_comment_authors_30k.feather ${similarity_data}/subreddit_comment_authors_10k.feather ${similarity_data}/subreddit_comment_terms_10k.feather ${similarity_data}/subreddit_comment_terms_30k.feather ${similarity_data}/subreddit_comment_authors-tf_30k.feather ${similarity_data}/subreddit_comment_authors-tf_10k.feather ${similarity_data}/subreddit_comment_terms_100k.feather ${similarity_data}/subreddit_comment_authors_100k.feather ${similarity_data}/subreddit_comment_authors-tf_100k.feather ${similarity_weekly_data}/comment_terms.parquet all: ${similarity_data}/subreddit_comment_authors-tf_10k.feather
#${tfidf_weekly_data}/comment_terms_100k.parquet ${tfidf_weekly_data}/comment_authors_100k.parquet ${tfidf_weekly_data}/comment_terms_30k.parquet ${tfidf_weekly_data}/comment_authors_30k.parquet ${similarity_weekly_data}/comment_terms_100k.parquet ${similarity_weekly_data}/comment_authors_100k.parquet ${similarity_weekly_data}/comment_terms_30k.parquet ${similarity_weekly_data}/comment_authors_30k.parquet ${similarity_data}/subreddit_comment_authors-tf_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py ${similarity_data}/subreddits_by_num_comments_nonsfw.csv
${srun_huge} /bin/bash -c "source ~/.bashrc; python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=10 --inpath=$<"
# /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_130k.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_weekly_130k.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv: ../../data/reddit_submissions_by_subreddit.parquet ../../data/reddit_comments_by_subreddit.parquet
../start_spark_and_run.sh 3 top_subreddits_by_comments.py
# all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet ${tfidf_data}/comment_authors_100k.parquet: ../../data/reddit_ngrams/comment_authors_sorted.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv
../start_spark_and_run.sh 3 tfidf.py authors --topN=100000 --inpath=$< --outpath=${tfidf_data}/comment_authors_100k.parquet
${similarity_weekly_data}/comment_terms.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_terms.parquet ../../data/reddit_ngrams/comment_authors_sorted.parquet:
${srun_singularity} python3 weekly_cosine_similarities.py terms --topN=10000 --outfile=${similarity_weekly_data}/comment_terms.parquet $(MAKE) -C ../ngrams
${similarity_data}/subreddit_comment_terms_10k.feather: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py ../../data/reddit_submissions_by_subreddit.parquet:
${srun_singularity} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_10k.feather --topN=10000 $(MAKE) -C ../datasets
${similarity_data}/subreddit_comment_terms_10k_LSI: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py ../../data/reddit_comments_by_subreddit.parquet:
${srun_singularity} python3 lsi_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=200 $(MAKE) -C ../datasets
${similarity_data}/subreddit_comment_terms_30k_LSI: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py
${srun_singularity} python3 lsi_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=200
${similarity_data}/subreddit_comment_terms_30k.feather: ${tfidf_data}/comment_terms_30k.parquet similarities_helper.py
${srun_singularity} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_30k.feather --topN=30000
${similarity_data}/subreddit_comment_authors_30k.feather: ${tfidf_data}/comment_authors_30k.parquet similarities_helper.py
${srun_singularity} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_30k.feather --topN=30000
${similarity_data}/subreddit_comment_authors_10k.feather: ${tfidf_data}/comment_authors_10k.parquet similarities_helper.py
${srun_singularity} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_10k.feather --topN=10000
${similarity_data}/subreddit_comment_authors_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
${srun_singularity} python3 lsi_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=2
${similarity_data}/subreddit_comment_authors_30k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
${srun_singularity} python3 lsi_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=2
${similarity_data}/subreddit_comment_authors-tf_30k.feather: ${tfidf_data}/comment_authors_30k.parquet similarities_helper.py
${srun_singularity} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_30k.feather --topN=30000
${similarity_data}/subreddit_comment_authors-tf_10k.feather: ${tfidf_data}/comment_authors_10k.parquet similarities_helper.py
${srun_singularity} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k.feather --topN=10000
${similarity_data}/subreddit_comment_authors-tf_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
${srun_singularity} python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=2
${similarity_data}/subreddit_comment_authors-tf_30k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
${srun_singularity} python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=2
${similarity_data}/subreddit_comment_terms_100k.feather: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py
${srun_singularity} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_100k.feather --topN=100000
${similarity_data}/subreddit_comment_authors_100k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
${srun_singularity} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_100k.feather --topN=100000
${similarity_data}/subreddit_comment_authors-tf_100k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
${srun_singularity} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_100k.feather --topN=100000
${tfidf_data}/comment_terms_100k.feather/: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv
mkdir -p ${tfidf_data}/
start_spark_and_run.sh 4 tfidf.py terms --topN=100000 --outpath=${tfidf_data}/comment_terms_100k.feather
${tfidf_data}/comment_terms_30k.feather: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv
mkdir -p ${tfidf_data}/
start_spark_and_run.sh 4 tfidf.py terms --topN=30000 --outpath=${tfidf_data}/comment_terms_30k.feather
${tfidf_data}/comment_terms_10k.feather: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv
mkdir -p ${tfidf_data}/
start_spark_and_run.sh 4 tfidf.py terms --topN=10000 --outpath=${tfidf_data}/comment_terms_10k.feather
${tfidf_data}/comment_authors_100k.feather: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments.csv
mkdir -p ${tfidf_data}/
start_spark_and_run.sh 4 tfidf.py authors --topN=100000 --outpath=${tfidf_data}/comment_authors_100k.feather
${tfidf_data}/comment_authors_10k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments.csv
mkdir -p ${tfidf_data}/
start_spark_and_run.sh 4 tfidf.py authors --topN=10000 --outpath=${tfidf_data}/comment_authors_10k.parquet
${tfidf_data}/comment_authors_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments.csv
mkdir -p ${tfidf_data}/
start_spark_and_run.sh 4 tfidf.py authors --topN=30000 --outpath=${tfidf_data}/comment_authors_30k.parquet
${tfidf_data}/tfidf_weekly/comment_terms_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv
start_spark_and_run.sh 4 tfidf.py terms_weekly --topN=100000 --outpath=${similarity_data}/tfidf_weekly/comment_authors_100k.parquet
${tfidf_data}/tfidf_weekly/comment_authors_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_ppnum_comments.csv
start_spark_and_run.sh 4 tfidf.py authors_weekly --topN=100000 --outpath=${tfidf_weekly_data}/comment_authors_100k.parquet
${tfidf_weekly_data}/comment_terms_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv
start_spark_and_run.sh 4 tfidf.py terms_weekly --topN=30000 --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet
${tfidf_weekly_data}/comment_authors_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv
start_spark_and_run.sh 4 tfidf.py authors_weekly --topN=30000 --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet
${similarity_weekly_data}/comment_terms_100k.parquet: weekly_cosine_similarities.py similarities_helper.py ${tfidf_weekly_data}/comment_terms_100k.parquet
${srun_singularity} python3 weekly_cosine_similarities.py terms --topN=100000 --outfile=${similarity_weekly_data}/comment_authors_100k.parquet
${similarity_weekly_data}/comment_authors_100k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_authors_100k.parquet
${srun_singularity} python3 weekly_cosine_similarities.py authors --topN=100000 --outfile=${similarity_weekly_data}/comment_authors_100k.parquet
${similarity_weekly_data}/comment_terms_30k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_terms_30k.parquet
${srun_singularity} python3 weekly_cosine_similarities.py terms --topN=30000 --outfile=${similarity_weekly_data}/comment_authors_30k.parquet
${similarity_weekly_data}/comment_authors_30k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_authors_30k.parquet
${srun_singularity} python3 weekly_cosine_similarities.py authors --topN=30000 --outfile=${similarity_weekly_data}/comment_authors_30k.parquet
# ${tfidf_weekly_data}/comment_authors_130k.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv
# start_spark_and_run.sh 1 tfidf.py authors_weekly --topN=130000
# /gscratch/comdata/output/reddit_similarity/comment_authors_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
# start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
# /gscratch/comdata/output/reddit_similarity/comment_terms.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet
# start_spark_and_run.sh 1 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
# /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet: cosine_similarities.py ${tfidf_weekly_data}/comment_authors.parquet
# start_spark_and_run.sh 1 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10000_weely.parquet
# /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
# start_spark_and_run.sh 1 cosine_similarities.py author-tf --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet

View File

@@ -11,7 +11,9 @@ def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None,
# change so that these take in an input as an optional argument (for speed, but also for idf). # change so that these take in an input as an optional argument (for speed, but also for idf).
def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', def term_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
return cosine_similarities(infile,
'term', 'term',
outfile, outfile,
min_df, min_df,
@@ -23,8 +25,8 @@ def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subredd
to_date to_date
) )
def author_cosine_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None): def author_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', return cosine_similarities(infile,
'author', 'author',
outfile, outfile,
min_df, min_df,
@@ -36,8 +38,8 @@ def author_cosine_similarities(outfile, min_df=2, max_df=None, included_subreddi
to_date=to_date to_date=to_date
) )
def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None): def author_tf_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', return cosine_similarities(infile,
'author', 'author',
outfile, outfile,
min_df, min_df,

View File

@@ -1,4 +1,6 @@
#!/usr/bin/bash #!/usr/bin/bash
source ~/.bashrc
echo $(hostname)
start_spark_cluster.sh start_spark_cluster.sh
singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 lsi_similarities.py author --outfile=/gscratch/comdata/output//reddit_similarity/subreddit_comment_authors_10k_LSI.feather --topN=10000 spark-submit --verbose --master spark://$(hostname):43015 tfidf.py authors --topN=100000 --inpath=../../data/reddit_ngrams/comment_authors_sorted.parquet --outpath=../../data/reddit_similarity/tfidf/comment_authors_100k.parquet
singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh stop-all.sh

View File

@@ -1,20 +1,42 @@
import pandas as pd import pandas as pd
import fire import fire
from pathlib import Path from pathlib import Path
from similarities_helper import similarities, lsi_column_similarities from similarities_helper import *
#from similarities_helper import similarities, lsi_column_similarities
from functools import partial from functools import partial
def lsi_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf',n_components=100,n_iter=5,random_state=1968,algorithm='arpack'): # inpath = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_authors_compex.parquet"
# term_colname='authors'
# outfile='/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_test_compex_LSI'
# n_components=[10,50,100]
# included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt"
# n_iter=5
# random_state=1968
# algorithm='randomized'
# topN = None
# from_date=None
# to_date=None
# min_df=None
# max_df=None
def lsi_similarities(inpath, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None, tfidf_colname='tf_idf',n_components=100,n_iter=5,random_state=1968,algorithm='arpack',lsi_model=None):
print(n_components,flush=True) print(n_components,flush=True)
simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=n_iter,random_state=random_state,algorithm=algorithm)
return similarities(infile=infile, simfunc=simfunc, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) if lsi_model is None:
if type(n_components) == list:
lsi_model = Path(outfile) / f'{max(n_components)}_{term_colname}_LSIMOD.pkl'
else:
lsi_model = Path(outfile) / f'{n_components}_{term_colname}_LSIMOD.pkl'
simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=n_iter,random_state=random_state,algorithm=algorithm,lsi_model_save=lsi_model)
return similarities(inpath=inpath, simfunc=simfunc, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
# change so that these take in an input as an optional argument (for speed, but also for idf). # change so that these take in an input as an optional argument (for speed, but also for idf).
def term_lsi_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, n_components=300,n_iter=5,random_state=1968,algorithm='arpack'): def term_lsi_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet',outfile=None, min_df=None, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None, algorithm='arpack', n_components=300,n_iter=5,random_state=1968):
return lsi_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', res = lsi_similarities(inpath,
'term', 'term',
outfile, outfile,
min_df, min_df,
@@ -23,11 +45,13 @@ def term_lsi_similarities(outfile, min_df=None, max_df=None, included_subreddits
topN, topN,
from_date, from_date,
to_date, to_date,
n_components=n_components n_components=n_components,
algorithm = algorithm
) )
return res
def author_lsi_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None,n_components=300,n_iter=5,random_state=1968,algorithm='arpack'): def author_lsi_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',outfile=None, min_df=2, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None,algorithm='arpack',n_components=300,n_iter=5,random_state=1968):
return lsi_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', return lsi_similarities(inpath,
'author', 'author',
outfile, outfile,
min_df, min_df,
@@ -39,8 +63,8 @@ def author_lsi_similarities(outfile, min_df=2, max_df=None, included_subreddits=
n_components=n_components n_components=n_components
) )
def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None,n_components=300,n_iter=5,random_state=1968,algorithm='arpack'): def author_tf_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',outfile=None, min_df=2, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None,algorithm='arpack',n_components=300,n_iter=5,random_state=1968):
return lsi_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', return lsi_similarities(inpath,
'author', 'author',
outfile, outfile,
min_df, min_df,
@@ -50,7 +74,8 @@ def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=N
from_date=from_date, from_date=from_date,
to_date=to_date, to_date=to_date,
tfidf_colname='relative_tf', tfidf_colname='relative_tf',
n_components=n_components n_components=n_components,
algorithm=algorithm
) )

View File

@@ -15,27 +15,54 @@ import numpy as np
import pathlib import pathlib
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
import pickle
class tf_weight(Enum): class tf_weight(Enum):
MaxTF = 1 MaxTF = 1
Norm05 = 2 Norm05 = 2
infile = "/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet" # infile = "/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet"
cache_file = "/gscratch/comdata/users/nathante/cdsc_reddit/similarities/term_tfidf_entries_bak.parquet" # cache_file = "/gscratch/comdata/users/nathante/cdsc_reddit/similarities/term_tfidf_entries_bak.parquet"
def termauthor_tfidf(term_tfidf_callable, author_tfidf_callable):
# subreddits missing after this step don't have any terms that have a high enough idf # subreddits missing after this step don't have any terms that have a high enough idf
# try rewriting without merges # try rewriting without merges
def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, week=None, from_date=None, to_date=None, rescale_idf=True, tf_family=tf_weight.MaxTF):
print("loading tfidf", flush=True) # does reindex_tfidf, but without reindexing.
def reindex_tfidf(*args, **kwargs):
    """Load tf-idf entries with reindexing enabled and build the subreddit name table.

    Every argument is forwarded to ``_pull_or_reindex_tfidf`` together with
    ``reindex=True``.

    Returns:
        A ``(df, subreddit_names)`` tuple. ``df`` is the entries frame returned
        by ``_pull_or_reindex_tfidf``; ``subreddit_names`` holds one row per
        subreddit name with its ``subreddit_id_new``, sorted by that id.
    """
    entries, dataset, row_filter = _pull_or_reindex_tfidf(*args, **kwargs, reindex=True)

    print("assigning names")
    name_table = dataset.to_table(filter=row_filter, columns=['subreddit', 'subreddit_id'])
    record_batches = name_table.to_batches()

    # Deduplicate each record batch in a worker process, then once more over
    # the concatenated result. The lazy iterator must be consumed while the
    # pool is still open.
    with Pool(cpu_count()) as workers:
        deduped = workers.imap_unordered(pull_names, record_batches)
        names = pd.concat(deduped, copy=False).drop_duplicates()
        names = names.set_index("subreddit_id")

    # Map each original subreddit_id to the id assigned during reindexing.
    id_map = entries.loc[:, ['subreddit_id', 'subreddit_id_new']].drop_duplicates()
    id_map = id_map.set_index('subreddit_id')

    names = names.join(id_map, on='subreddit_id').reset_index()
    names = names.drop("subreddit_id", axis=1)
    names = names.sort_values("subreddit_id_new")
    return (entries, names)
def pull_tfidf(*args, **kwargs):
    """Load tf-idf entries without reindexing.

    Forwards every argument to ``_pull_or_reindex_tfidf`` with
    ``reindex=False`` and returns only the entries frame, discarding the
    dataset handle and the filter expression.
    """
    entries, _dataset, _row_filter = _pull_or_reindex_tfidf(*args, **kwargs, reindex=False)
    return entries
def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=None, week=None, from_date=None, to_date=None, rescale_idf=True, tf_family=tf_weight.MaxTF, reindex=True):
print(f"loading tfidf {infile}, week {week}, min_df {min_df}, max_df {max_df}", flush=True)
if week is not None:
tfidf_ds = ds.dataset(infile, partitioning='hive')
else:
tfidf_ds = ds.dataset(infile) tfidf_ds = ds.dataset(infile)
if included_subreddits is None: if included_subreddits is None:
included_subreddits = select_topN_subreddits(topN) included_subreddits = select_topN_subreddits(topN)
else: else:
included_subreddits = set(open(included_subreddits)) included_subreddits = set(map(str.strip,open(included_subreddits)))
ds_filter = ds.field("subreddit").isin(included_subreddits) ds_filter = ds.field("subreddit").isin(included_subreddits)
@@ -71,15 +98,23 @@ def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subre
'relative_tf':ds.field('relative_tf').cast('float32'), 'relative_tf':ds.field('relative_tf').cast('float32'),
'tf_idf':ds.field('tf_idf').cast('float32')} 'tf_idf':ds.field('tf_idf').cast('float32')}
tfidf_ds = ds.dataset(infile) print(projection, flush=True)
print(ds_filter, flush=True)
df = tfidf_ds.to_table(filter=ds_filter,columns=projection) df = tfidf_ds.to_table(filter=ds_filter,columns=projection)
df = df.to_pandas(split_blocks=True,self_destruct=True) df = df.to_pandas(split_blocks=True,self_destruct=True)
print("assigning indexes",flush=True) print("assigning indexes",flush=True)
df['subreddit_id_new'] = df.groupby("subreddit_id").ngroup() if reindex:
print("assigning indexes",flush=True)
df['subreddit_id_new'] = df.groupby("subreddit_id").ngroup() + 1
else:
df['subreddit_id_new'] = df['subreddit_id']
if reindex:
grouped = df.groupby(term_id) grouped = df.groupby(term_id)
df[term_id_new] = grouped.ngroup() df[term_id_new] = grouped.ngroup() + 1
else:
df[term_id_new] = df[term_id]
if rescale_idf: if rescale_idf:
print("computing idf", flush=True) print("computing idf", flush=True)
@@ -91,26 +126,13 @@ def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subre
else: # tf_fam = tf_weight.Norm05 else: # tf_fam = tf_weight.Norm05
df["tf_idf"] = (0.5 + 0.5 * df.relative_tf) * df.idf df["tf_idf"] = (0.5 + 0.5 * df.relative_tf) * df.idf
print("assigning names") return (df, tfidf_ds, ds_filter)
subreddit_names = tfidf_ds.to_table(filter=ds_filter,columns=['subreddit','subreddit_id'])
batches = subreddit_names.to_batches()
with Pool(cpu_count()) as pool:
chunks = pool.imap_unordered(pull_names,batches)
subreddit_names = pd.concat(chunks,copy=False).drop_duplicates()
subreddit_names = subreddit_names.set_index("subreddit_id")
new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates()
new_ids = new_ids.set_index('subreddit_id')
subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index()
subreddit_names = subreddit_names.drop("subreddit_id",1)
subreddit_names = subreddit_names.sort_values("subreddit_id_new")
return(df, subreddit_names)
def pull_names(batch): def pull_names(batch):
return(batch.to_pandas().drop_duplicates()) return(batch.to_pandas().drop_duplicates())
def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'): def similarities(inpath, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'):
''' '''
tfidf_colname: set to 'relative_tf' to use normalized term frequency instead of tf-idf, which can be useful for author-based similarities. tfidf_colname: set to 'relative_tf' to use normalized term frequency instead of tf-idf, which can be useful for author-based similarities.
''' '''
@@ -130,7 +152,7 @@ def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=Non
output_feather = Path(str(p).replace("".join(p.suffixes), ".feather")) output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
output_csv = Path(str(p).replace("".join(p.suffixes), ".csv")) output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet")) output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))
outfile.parent.mkdir(exist_ok=True, parents=True) p.parent.mkdir(exist_ok=True, parents=True)
sims.to_feather(outfile) sims.to_feather(outfile)
@@ -138,8 +160,8 @@ def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=Non
term_id = term + '_id' term_id = term + '_id'
term_id_new = term + '_id_new' term_id_new = term + '_id_new'
entries, subreddit_names = reindex_tfidf(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN,from_date=from_date,to_date=to_date) entries, subreddit_names = reindex_tfidf(inpath, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN,from_date=from_date,to_date=to_date)
mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new], entries.subreddit_id_new))) mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1)))
print("loading matrix") print("loading matrix")
@@ -154,7 +176,7 @@ def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=Non
for simmat, name in sims: for simmat, name in sims:
proc_sims(simmat, Path(outfile)/(str(name) + ".feather")) proc_sims(simmat, Path(outfile)/(str(name) + ".feather"))
else: else:
proc_sims(simmat, outfile) proc_sims(sims, outfile)
def write_weekly_similarities(path, sims, week, names): def write_weekly_similarities(path, sims, week, names):
sims['week'] = week sims['week'] = week
@@ -207,10 +229,9 @@ def test_lsi_sims():
# if n_components is a list we'll return a list of similarities with different latent dimensionalities # if n_components is a list we'll return a list of similarities with different latent dimensionalities
# if algorithm is 'randomized' instead of 'arpack' then n_iter gives the number of iterations. # if algorithm is 'randomized' instead of 'arpack' then n_iter gives the number of iterations.
# this function takes the svd and then the column similarities of it # this function takes the svd and then the column similarities of it
def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=1968,algorithm='randomized'): def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=1968,algorithm='randomized',lsi_model_save=None,lsi_model_load=None):
# first compute the lsi of the matrix # first compute the lsi of the matrix
# then take the column similarities # then take the column similarities
print("running LSI",flush=True)
if type(n_components) is int: if type(n_components) is int:
n_components = [n_components] n_components = [n_components]
@@ -218,15 +239,28 @@ def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=196
n_components = sorted(n_components,reverse=True) n_components = sorted(n_components,reverse=True)
svd_components = n_components[0] svd_components = n_components[0]
if lsi_model_load is not None and Path(lsi_model_load).exists():
print("loading LSI")
mod = pickle.load(open(lsi_model_load ,'rb'))
lsi_model_save = lsi_model_load
else:
print("running LSI",flush=True)
svd = TruncatedSVD(n_components=svd_components,random_state=random_state,algorithm=algorithm,n_iter=n_iter) svd = TruncatedSVD(n_components=svd_components,random_state=random_state,algorithm=algorithm,n_iter=n_iter)
mod = svd.fit(tfidfmat.T) mod = svd.fit(tfidfmat.T)
if lsi_model_save is not None:
Path(lsi_model_save).parent.mkdir(exist_ok=True, parents=True)
pickle.dump(mod, open(lsi_model_save,'wb'))
print(n_components, flush=True)
lsimat = mod.transform(tfidfmat.T) lsimat = mod.transform(tfidfmat.T)
for n_dims in n_components: for n_dims in n_components:
print("computing similarities", flush=True)
sims = column_similarities(lsimat[:,np.arange(n_dims)]) sims = column_similarities(lsimat[:,np.arange(n_dims)])
if len(n_components) > 1:
yield (sims, n_dims) yield (sims, n_dims)
else:
return sims
def column_similarities(mat): def column_similarities(mat):
@@ -257,20 +291,20 @@ def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weig
idf = idf.withColumn('idf',f.log(idf.subreddits_in_week) / (1+f.col('count'))+1) idf = idf.withColumn('idf',f.log(idf.subreddits_in_week) / (1+f.col('count'))+1)
# collect the dictionary to make a pydict of terms to indexes # collect the dictionary to make a pydict of terms to indexes
terms = idf.select([term,'week']).distinct() # terms are distinct terms = idf.select([term]).distinct() # terms are distinct
terms = terms.withColumn(term_id,f.row_number().over(Window.partitionBy('week').orderBy(term))) # term ids are distinct terms = terms.withColumn(term_id,f.row_number().over(Window.orderBy(term))) # term ids are distinct
# make subreddit ids # make subreddit ids
subreddits = df.select(['subreddit','week']).distinct() subreddits = df.select(['subreddit']).distinct()
subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.partitionBy("week").orderBy("subreddit"))) subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.orderBy("subreddit")))
df = df.join(subreddits,on=['subreddit','week']) df = df.join(subreddits,on=['subreddit'])
# map terms to indexes in the tfs and the idfs # map terms to indexes in the tfs and the idfs
df = df.join(terms,on=[term,'week']) # subreddit-term-id is unique df = df.join(terms,on=[term]) # subreddit-term-id is unique
idf = idf.join(terms,on=[term,'week']) idf = idf.join(terms,on=[term])
# join on subreddit/term to create tf/dfs indexed by term # join on subreddit/term to create tf/dfs indexed by term
df = df.join(idf, on=[term_id, term,'week']) df = df.join(idf, on=[term_id, term,'week'])
@@ -282,11 +316,11 @@ def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weig
else: # tf_fam = tf_weight.Norm05 else: # tf_fam = tf_weight.Norm05
df = df.withColumn("tf_idf", (0.5 + 0.5 * df.relative_tf) * df.idf) df = df.withColumn("tf_idf", (0.5 + 0.5 * df.relative_tf) * df.idf)
df = df.repartition(400,'subreddit','week') df = df.repartition('week')
dfwriter = df.write.partitionBy("week").sortBy("subreddit") dfwriter = df.write.partitionBy("week")
return dfwriter return dfwriter
def _calc_tfidf(df, term_colname, tf_family): def _calc_tfidf(df, term_colname, tf_family, min_df=None, max_df=None):
term = term_colname term = term_colname
term_id = term + '_id' term_id = term + '_id'
@@ -304,7 +338,13 @@ def _calc_tfidf(df, term_colname, tf_family):
idf = idf.withColumn('idf',f.log(N_docs/(1+f.col('count')))+1) idf = idf.withColumn('idf',f.log(N_docs/(1+f.col('count')))+1)
# collect the dictionary to make a pydict of terms to indexes # collect the dictionary to make a pydict of terms to indexes
terms = idf.select(term).distinct() # terms are distinct terms = idf
if min_df is not None:
terms = terms.filter(f.col('count')>=min_df)
if max_df is not None:
terms = terms.filter(f.col('count')<=max_df)
terms = terms.select(term).distinct() # terms are distinct
terms = terms.withColumn(term_id,f.row_number().over(Window.orderBy(term))) # term ids are distinct terms = terms.withColumn(term_id,f.row_number().over(Window.orderBy(term))) # term ids are distinct
# make subreddit ids # make subreddit ids
@@ -314,12 +354,12 @@ def _calc_tfidf(df, term_colname, tf_family):
df = df.join(subreddits,on='subreddit') df = df.join(subreddits,on='subreddit')
# map terms to indexes in the tfs and the idfs # map terms to indexes in the tfs and the idfs
df = df.join(terms,on=term) # subreddit-term-id is unique df = df.join(terms,on=term,how='inner') # subreddit-term-id is unique
idf = idf.join(terms,on=term) idf = idf.join(terms,on=term,how='inner')
# join on subreddit/term to create tf/dfs indexed by term # join on subreddit/term to create tf/dfs indexed by term
df = df.join(idf, on=[term_id, term]) df = df.join(idf, on=[term_id, term],how='inner')
# agg terms by subreddit to make sparse tf/df vectors # agg terms by subreddit to make sparse tf/df vectors
if tf_family == tf_weight.MaxTF: if tf_family == tf_weight.MaxTF:
@@ -330,19 +370,19 @@ def _calc_tfidf(df, term_colname, tf_family):
return df return df
def build_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05): def tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05, min_df=None, max_df=None):
term = term_colname term = term_colname
term_id = term + '_id' term_id = term + '_id'
# aggregate counts by week. now subreddit-term is distinct # aggregate counts by week. now subreddit-term is distinct
df = df.filter(df.subreddit.isin(include_subs)) df = df.filter(df.subreddit.isin(include_subs))
df = df.groupBy(['subreddit',term]).agg(f.sum('tf').alias('tf')) df = df.groupBy(['subreddit',term]).agg(f.sum('tf').alias('tf'))
df = _calc_tfidf(df, term_colname, tf_family) df = _calc_tfidf(df, term_colname, tf_family, min_df, max_df)
df = df.repartition('subreddit') df = df.repartition('subreddit')
dfwriter = df.write.sortBy("subreddit","tf") dfwriter = df.write
return dfwriter return dfwriter
def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonsfw.csv"): def select_topN_subreddits(topN, path="../../data/reddit_similarity/subreddits_by_num_comments_nonsfw.csv"):
rankdf = pd.read_csv(path) rankdf = pd.read_csv(path)
included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values) included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values)
return included_subreddits return included_subreddits

View File

@@ -1,9 +1,12 @@
import fire import fire
from pyspark.sql import SparkSession from pyspark.sql import SparkSession
from pyspark.sql import functions as f from pyspark.sql import functions as f
from similarities_helper import build_tfidf_dataset, build_weekly_tfidf_dataset, select_topN_subreddits from similarities_helper import tfidf_dataset, build_weekly_tfidf_dataset, select_topN_subreddits
from functools import partial
def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits): inpath = '/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_authors_compex.parquet'
# included_terms is a path to a parquet file with a column named term_colname; only rows whose term appears in that column are kept.
def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits, included_terms=None, min_df=None, max_df=None):
spark = SparkSession.builder.getOrCreate() spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet(inpath) df = spark.read.parquet(inpath)
@@ -11,64 +14,95 @@ def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_
df = df.filter(~ f.col(term_colname).isin(exclude)) df = df.filter(~ f.col(term_colname).isin(exclude))
if included_subreddits is not None: if included_subreddits is not None:
include_subs = list(open(included_subreddits)) include_subs = set(map(str.strip,open(included_subreddits)))
else: else:
include_subs = select_topN_subreddits(topN) include_subs = select_topN_subreddits(topN)
dfwriter = func(df, include_subs, term_colname) include_subs = spark.sparkContext.broadcast(include_subs)
# term_id = term_colname + "_id"
if included_terms is not None:
terms_df = spark.read.parquet(included_terms)
terms_df = terms_df.select(term_colname).distinct()
df = df.join(terms_df, on=term_colname, how='left_semi')
dfwriter = func(df, include_subs.value, term_colname)
dfwriter.parquet(outpath,mode='overwrite',compression='snappy') dfwriter.parquet(outpath,mode='overwrite',compression='snappy')
spark.stop() spark.stop()
def tfidf(inpath, outpath, topN, term_colname, exclude, included_subreddits): def tfidf(inpath, outpath, topN, term_colname, exclude, included_subreddits, min_df, max_df):
return _tfidf_wrapper(build_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits) tfidf_func = partial(tfidf_dataset, max_df=max_df, min_df=min_df)
return _tfidf_wrapper(tfidf_func, inpath, outpath, topN, term_colname, exclude, included_subreddits)
def tfidf_weekly(inpath, outpath, topN, term_colname, exclude, included_subreddits): def tfidf_weekly(inpath, outpath, static_tfidf_path, topN, term_colname, exclude, included_subreddits):
return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits) return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits, included_terms=static_tfidf_path)
def tfidf_authors(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
topN=25000):
return tfidf("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet", def tfidf_authors(inpath="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
topN=None,
included_subreddits=None,
min_df=None,
max_df=None):
return tfidf(inpath,
outpath, outpath,
topN, topN,
'author', 'author',
['[deleted]','AutoModerator'], ['[deleted]','AutoModerator'],
included_subreddits=None included_subreddits=included_subreddits,
min_df=min_df,
max_df=max_df
) )
def tfidf_terms(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet', def tfidf_terms(inpath="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
topN=25000): outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet',
topN=None,
included_subreddits=None,
min_df=None,
max_df=None):
return tfidf("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet", return tfidf(inpath,
outpath, outpath,
topN, topN,
'term', 'term',
[], [],
included_subreddits=None included_subreddits=included_subreddits,
min_df=min_df,
max_df=max_df
) )
def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet', def tfidf_authors_weekly(inpath="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
topN=25000): static_tfidf_path="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet",
outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet',
topN=None,
included_subreddits=None):
return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet", return tfidf_weekly(inpath,
outpath, outpath,
static_tfidf_path,
topN, topN,
'author', 'author',
['[deleted]','AutoModerator'], ['[deleted]','AutoModerator'],
included_subreddits=None included_subreddits=included_subreddits
) )
def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', def tfidf_terms_weekly(inpath="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
topN=25000): static_tfidf_path="/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet",
outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
topN=None,
included_subreddits=None):
return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet", return tfidf_weekly(inpath,
outpath, outpath,
static_tfidf_path,
topN, topN,
'term', 'term',
[], [],
included_subreddits=None included_subreddits=included_subreddits
) )

View File

@@ -1,16 +1,20 @@
from pyspark.sql import functions as f from pyspark.sql import functions as f
from pyspark.sql import SparkSession from pyspark.sql import SparkSession
from pyspark.sql import Window from pyspark.sql import Window
from datetime import datetime
from pathlib import Path
spark = SparkSession.builder.getOrCreate() spark = SparkSession.builder.getOrCreate()
conf = spark.sparkContext.getConf() conf = spark.sparkContext.getConf()
submissions = spark.read.parquet("/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet") submissions = spark.read.parquet("../../data/reddit_submissions_by_subreddit.parquet")
submissions = submissions.filter(f.col("CreatedAt") <= datetime(2020,4,13))
prop_nsfw = submissions.select(['subreddit','over_18']).groupby('subreddit').agg(f.mean(f.col('over_18').astype('double')).alias('prop_nsfw')) prop_nsfw = submissions.select(['subreddit','over_18']).groupby('subreddit').agg(f.mean(f.col('over_18').astype('double')).alias('prop_nsfw'))
df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet") df = spark.read.parquet("../../data/reddit_comments_by_subreddit.parquet")
df = df.filter(f.col("CreatedAt") <= datetime(2020,4,13))
# remove /u/ pages # remove /u/ pages
df = df.filter(~df.subreddit.like("u_%")) df = df.filter(~df.subreddit.like("u_%"))
@@ -26,4 +30,6 @@ df = df.toPandas()
df = df.sort_values("n_comments") df = df.sort_values("n_comments")
df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv', index=False) outpath = Path("../../data/reddit_similarity/subreddits_by_num_comments_nonsfw.csv")
outpath.parent.mkdir(exist_ok=True, parents=True)
df.to_csv(str(outpath), index=False)

View File

@@ -1,18 +0,0 @@
from similarities_helper import similarities
import numpy as np
import fire
def wang_similarity(mat):
    """Count, for every pair of columns of ``mat``, the rows where both are non-zero.

    ``mat`` is turned into a float32 indicator matrix (non-zero -> 1.0) and
    multiplied with its own transpose, so entry ``(i, j)`` of the result is the
    number of rows in which columns ``i`` and ``j`` are both non-zero. The
    diagonal therefore holds each column's non-zero count.
    """
    indicator = (mat != 0).astype(np.float32)
    return indicator.T @ indicator
# Module-level assignments using the same names as wang_overlaps' parameters.
# NOTE(review): these look like leftover values for interactive debugging; wang_overlaps takes its own arguments and does not read them — confirm before removing.
infile="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet"; outfile="/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather"; min_df=1; included_subreddits=None; topN=10000; exclude_phrases=False; from_date=None; to_date=None
def wang_overlaps(infile, outfile="/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather", min_df=1, max_df=None, included_subreddits=None, topN=10000, exclude_phrases=False, from_date=None, to_date=None):
    """Run ``similarities`` with ``wang_similarity`` over the ``author`` term column.

    All arguments are forwarded unchanged to ``similarities``; the return
    value is whatever ``similarities`` returns.
    """
    forwarded = dict(
        infile=infile,
        outfile=outfile,
        min_df=min_df,
        max_df=max_df,
        included_subreddits=included_subreddits,
        topN=topN,
        exclude_phrases=exclude_phrases,
        from_date=from_date,
        to_date=to_date,
    )
    return similarities(simfunc=wang_similarity, term_colname='author', **forwarded)
if __name__ == "__main__":
fire.Fire(wang_overlaps)

View File

@@ -1,81 +0,0 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
import numpy as np
import pyarrow
import pyarrow.dataset as ds
import pandas as pd
import fire
from itertools import islice, chain
from pathlib import Path
from similarities_helper import *
from multiprocessing import Pool, cpu_count
from functools import partial
def _week_similarities(week, simfunc, tfidf_path, term_colname, min_df, max_df, included_subreddits, topN, outdir:Path):
    """Compute and write the subreddit similarity matrix for a single week.

    Parameters
    ----------
    week : value identifying the week; passed to ``reindex_tfidf`` and used
        (via ``str(week)``) as the output file name inside ``outdir``.
    simfunc : callable applied to the sparse term-by-subreddit matrix; its
        result must support ``.todense()``.
    tfidf_path, term_colname, min_df, max_df, included_subreddits, topN :
        forwarded unchanged to ``reindex_tfidf``.
    outdir : directory the weekly output is written under.

    Returns ``None``; the result is handed to ``write_weekly_similarities``.
    """
    term_id_new = term_colname + '_id_new'

    print(f"loading matrix: {week}")
    entries, subreddit_names = reindex_tfidf(infile=tfidf_path,
                                             term_colname=term_colname,
                                             min_df=min_df,
                                             max_df=max_df,
                                             included_subreddits=included_subreddits,
                                             topN=topN,
                                             week=week)

    # NOTE(review): `tfidf_colname` is not defined in this function; it only
    # resolves if `from similarities_helper import *` exports it -- confirm.
    mat = csr_matrix((entries[tfidf_colname], (entries[term_id_new], entries.subreddit_id_new)))

    print('computing similarities')
    # Use the similarity function the caller passed in instead of a
    # hard-coded column_similarities, so the parameter is actually honoured.
    sims = simfunc(mat)
    del mat  # release the sparse matrix before materialising the dense result

    sims = pd.DataFrame(sims.todense())
    # Label the similarity columns with subreddit names (positional order).
    sims = sims.rename(dict(enumerate(subreddit_names.subreddit.values)), axis=1)
    # The original referenced an undefined `names`; the frame returned by
    # reindex_tfidf is `subreddit_names`.
    sims['_subreddit'] = subreddit_names.subreddit.values

    outfile = str(Path(outdir) / str(week))
    write_weekly_similarities(outfile, sims, week, subreddit_names)
def pull_weeks(batch):
    """Return the set of distinct values in the ``week`` column of ``batch``.

    ``batch`` must provide ``to_pandas()`` returning a frame with a ``week``
    column.
    """
    frame = batch.to_pandas()
    return {week for week in frame['week']}
#tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_weekly.parquet')
def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, max_df=None, included_subreddits = None, topN = 500):
    """Compute similarities for every week found in the dataset at ``tfidf_path``.

    The distinct values of the ``week`` column are collected with a process
    pool, sorted, and then ``_week_similarities`` is mapped over them in a
    second pool. ``outfile`` is passed to ``_week_similarities`` as its
    ``outdir``; the remaining arguments are forwarded unchanged.
    """
    print(outfile)

    week_table = ds.dataset(tfidf_path).to_table(columns=["week"])
    week_batches = week_table.to_batches()

    # Gather the distinct weeks batch by batch; arrival order is irrelevant
    # because the result is a set that is sorted afterwards.
    seen_weeks = set()
    with Pool(cpu_count()) as pool:
        for batch_weeks in pool.imap_unordered(pull_weeks, week_batches):
            seen_weeks.update(batch_weeks)
    weeks = sorted(seen_weeks)

    print(f"computing weekly similarities")
    per_week = partial(_week_similarities,
                       simfunc=column_similarities,
                       tfidf_path=tfidf_path,
                       term_colname=term_colname,
                       outdir=outfile,
                       min_df=min_df,
                       max_df=max_df,
                       included_subreddits=included_subreddits,
                       topN=topN)

    # One task per week, one worker per core.
    with Pool(cpu_count()) as pool:
        pool.map(per_week, weeks)
def author_cosine_similarities_weekly(outfile, min_df=2, max_df=None, included_subreddits=None, topN=500):
    """Weekly similarities over the ``author`` column of the comment-authors tf-idf dataset."""
    tfidf_path = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet'
    return cosine_similarities_weekly(tfidf_path=tfidf_path,
                                      outfile=outfile,
                                      term_colname='author',
                                      min_df=min_df,
                                      max_df=max_df,
                                      included_subreddits=included_subreddits,
                                      topN=topN)
def term_cosine_similarities_weekly(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500):
    """Weekly similarities over the ``term`` column of the comment-terms tf-idf dataset."""
    tfidf_path = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet'
    return cosine_similarities_weekly(tfidf_path=tfidf_path,
                                      outfile=outfile,
                                      term_colname='term',
                                      min_df=min_df,
                                      max_df=max_df,
                                      included_subreddits=included_subreddits,
                                      topN=topN)
if __name__ == "__main__":
fire.Fire({'authors':author_cosine_similarities_weekly,
'terms':term_cosine_similarities_weekly})

21
start_spark_and_run.sh Executable file
View File

@@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Script to start a spark cluster and run a script on klone
#
# usage: start_spark_and_run.sh <nodes> <script> [script args...]
#   $1      number of nodes requested from slurm
#   $2      script handed to spark-submit
#   ${@:3}  remaining arguments, forwarded to that script
source $SPARK_CONF_DIR/spark-env.sh

# Generate the job script that srun executes inside the allocation.
# Every line must be appended to the same file (job_script.sh): previously the
# export lines and the spark-env.sh line were written to "job.script.sh" and
# therefore never became part of the script that actually runs.
echo "#!/usr/bin/bash" > job_script.sh
echo "source ~/.bashrc" >> job_script.sh
echo "export PYSPARK_PYTHON=python3" >> job_script.sh
echo "export JAVA_HOME=/gscratch/comdata/local/open-jdk" >> job_script.sh
echo "export SPARK_CONF_DIR=/gscratch/comdata/local/spark_config" >> job_script.sh
# \$(hostname) is escaped so it is evaluated on the node running the job,
# not on the machine generating the script.
echo "echo \$(hostname)" >> job_script.sh
# $SPARK_CONF_DIR is expanded here, at generation time.
echo "source $SPARK_CONF_DIR/spark-env.sh" >> job_script.sh
echo "start_spark_cluster.sh" >> job_script.sh
echo "spark-submit --verbose --master spark://\$(hostname):$SPARK_MASTER_PORT $2 ${@:3}" >> job_script.sh
echo "stop-all.sh" >> job_script.sh
#echo "singularity instance stop --all" >> job_script.sh
chmod +x job_script.sh
salloc -p compute-bigmem -A comdata --nodes=$1 --time=48:00:00 -c 40 --mem=362G --exclusive srun -n1 job_script.sh

26
start_spark_cluster.sh Executable file
View File

@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Start a spark master on this host, then start a worker on every host
# reported by `scontrol show hostnames` by ssh-ing into it.
nodes="$(scontrol show hostnames)"
# The master is the host this script runs on; each worker is told to connect to it.
export SPARK_MASTER_HOST=$(hostname)
echo $SPARK_MASTER_HOST
# singularity instance stop spark-boss
# rm -r $HOME/.singularity/instances/sing/$(hostname)/nathante/spark-boss
# for node in $nodes
# do
# echo $node
# ssh $node "singularity instance stop --all -F"
# done
# singularity instance start /gscratch/comdata/users/nathante/cdsc_base.sif spark-boss
#apptainer exec /gscratch/comdata/users/nathante/containers/nathante.sif
start-master.sh
# $nodes is intentionally unquoted so it word-splits into one hostname per iteration.
for node in $nodes
do
# if [ "$node" != "$SPARK_BOSS" ]
# then
echo $node
# start_spark_worker.sh receives the master host as its first argument.
ssh -t $node start_spark_worker.sh $SPARK_MASTER_HOST
# fi
done

18
start_spark_worker.sh Executable file
View File

@@ -0,0 +1,18 @@
#!/usr/bin/env bash
# runs on worker node
# usage: start_spark_worker.sh <master-host>
#   $1  host name of the spark master this worker connects to
# instance_name=spark-worker-$(hostname)
# echo $hostname
# instance_url="instance://$instance_name"
# singularity instance list
# singularity instance stop -F "$instance_name"
# singularity instance list
# sleep 5
# ls $HOME/.singularity/instances/sing/$(hostname)/nathante/$instance_name
# rm -r $HOME/.singularity/instances/sing/$(hostname)/nathante/$instance_name
# singularity instance start /gscratch/comdata/users/nathante/cdsc_base.sif $instance_name
# Load the shared environment and the spark settings before starting the worker;
# SPARK_MASTER_PORT below is presumably set by spark-env.sh -- confirm.
source /gscratch/comdata/env/cdsc_klone_bashrc
source $SPARK_CONF_DIR/spark-env.sh
# Print the interpreter and PYSPARK_PYTHON in use on this node.
echo $(which python3)
echo $PYSPARK_PYTHON
# Echo the command before running it so it shows up in the job output.
echo "start-worker.sh spark://$1:$SPARK_MASTER_PORT"
start-worker.sh spark://$1:$SPARK_MASTER_PORT

View File

@@ -1,96 +0,0 @@
from pyarrow import dataset as ds
import numpy as np
import pandas as pd
import plotnine as pn
random = np.random.RandomState(1968)
def load_densities(term_density_file="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather",
                   author_density_file="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather"):
    """Load the term and author density tables and join them per subreddit.

    In each feather file ``overlap_density`` is renamed to ``term_density`` /
    ``author_density`` and ``index`` to ``subreddit``; the two frames are then
    inner-joined on ``subreddit`` and the joined frame is returned.
    """
    by_term = pd.read_feather(term_density_file)
    by_author = pd.read_feather(author_density_file)

    by_term = by_term.rename(columns={'overlap_density': 'term_density', 'index': 'subreddit'})
    by_author = by_author.rename(columns={'overlap_density': 'author_density', 'index': 'subreddit'})

    return by_term.merge(by_author, on='subreddit', how='inner')
def load_clusters(term_clusters_file="/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather",
                  author_clusters_file="/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather"):
    """Load the term and author cluster assignments and join them per subreddit.

    The ``cluster`` column of each feather file is renamed to ``term_cluster``
    / ``author_cluster``; the two frames are inner-joined on ``subreddit`` and
    the joined frame is returned.
    """
    by_term = pd.read_feather(term_clusters_file)
    by_author = pd.read_feather(author_clusters_file)

    by_term = by_term.rename(columns={'cluster': 'term_cluster'})
    by_author = by_author.rename(columns={'cluster': 'author_cluster'})

    return by_term.merge(by_author, on='subreddit', how='inner')
# Exploratory script: rank subreddits by density, summarise the ranks per
# cluster, sample three high-term-density clusters, pull their comments and
# plot per-cluster time series.
if __name__ == '__main__':
df = load_densities()
cl = load_clusters()
# Rank-based percentiles: rank divided by the number of subreddits.
df['td_rank'] = df.term_density.rank()
df['ad_rank'] = df.author_density.rank()
df['td_percentile'] = df.td_rank / df.shape[0]
df['ad_percentile'] = df.ad_rank / df.shape[0]
df = df.merge(cl, on='subreddit',how='inner')
# Per-cluster mean/min/max of the ranks and percentiles, plus cluster size.
term_cluster_density = df.groupby('term_cluster').agg({'td_rank':['mean','min','max'],
'ad_rank':['mean','min','max'],
'td_percentile':['mean','min','max'],
'ad_percentile':['mean','min','max'],
'subreddit':['count']})
author_cluster_density = df.groupby('author_cluster').agg({'td_rank':['mean','min','max'],
'ad_rank':['mean','min','max'],
'td_percentile':['mean','min','max'],
'ad_percentile':['mean','min','max'],
'subreddit':['count']})
# NOTE(review): the next two expressions are evaluated but their results are
# discarded; they only display something in an interactive session.
# which clusters have the most term_density?
term_cluster_density.iloc[term_cluster_density.td_rank['mean'].sort_values().index]
# which clusters have the most author_density?
term_cluster_density.iloc[term_cluster_density.ad_rank['mean'].sort_values(ascending=False).index].loc[term_cluster_density.subreddit['count'] >= 5][0:20]
# Candidate clusters: mean term-density percentile above 0.75 and more than 5 subreddits.
high_density_term_clusters = term_cluster_density.loc[(term_cluster_density.td_percentile['mean'] > 0.75) & (term_cluster_density.subreddit['count'] > 5)]
# let's just use term density instead of author density for now. We can do a second batch with author density next.
# `random` is the module-level RandomState(1968), so the sample is reproducible.
chosen_clusters = high_density_term_clusters.sample(3,random_state=random)
cluster_info = df.loc[df.term_cluster.isin(chosen_clusters.index.values)]
chosen_subreddits = cluster_info.subreddit.values
# Read only the comments of the chosen subreddits, and only the columns needed.
dataset = ds.dataset("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet",format='parquet')
comments = dataset.to_table(filter=ds.field("subreddit").isin(chosen_subreddits),columns=['id','subreddit','author','CreatedAt'])
comments = comments.to_pandas()
# Week = comment date minus its day-of-week offset (dayofweek is 0 on Monday).
comments['week'] = comments.CreatedAt.dt.date - pd.to_timedelta(comments['CreatedAt'].dt.dayofweek, unit='d')
# Rows per (subreddit, week) after dropping duplicate (subreddit, author, week) triples.
# NOTE(review): author_timeseries is not used again in this block, while the
# loop below reads `ts` from feather files this block never writes -- confirm
# where those files come from.
author_timeseries = comments.loc[:,['subreddit','author','week']].drop_duplicates().groupby(['subreddit','week']).count().reset_index()
for clid in chosen_clusters.index.values:
ts = pd.read_feather(f"data/ts_term_cluster_{clid}.feather")
pn.options.figure_size = (11.7,8.27)
p = pn.ggplot(ts)
p = p + pn.geom_line(pn.aes('week','value',group='subreddit'))
p = p + pn.facet_wrap('~ subreddit')
p.save(f"plots/ts_term_cluster_{clid}.png")
# NOTE(review): `pyplot` and `sns` are not imported in the visible import
# block of this file; the three lines below would raise NameError -- confirm.
fig, ax = pyplot.subplots(figsize=(11.7,8.27))
g = sns.FacetGrid(ts,row='subreddit')
g.map_dataframe(sns.scatterplot,'week','value',data=ts,ax=ax)

View File

@@ -1,37 +0,0 @@
import pandas as pd
import numpy as np
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from choose_clusters import load_clusters, load_densities
import fire
from pathlib import Path
def main(term_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather",
         author_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather",
         term_densities_path="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather",
         author_densities_path="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather",
         output="data/subreddit_timeseries.parquet"):
    """Write a weekly distinct-author count per subreddit, joined with cluster and density data.

    Comments are read from the fixed ``reddit_comments_by_subreddit.parquet``
    path, truncated to weeks, reduced to distinct (subreddit, week, author)
    rows and counted per (subreddit, week). The counts are inner-joined on
    ``subreddit`` with the cluster table and then the density table, and
    written to ``output`` as parquet, overwriting existing data.
    """
    clusters = load_clusters(term_clusters_path, author_clusters_path)
    densities = load_densities(term_densities_path, author_densities_path)

    spark = SparkSession.builder.getOrCreate()

    comments = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet")
    comments = comments.withColumn('week', f.date_trunc('week', f.col("CreatedAt")))

    # number of unique authors per subreddit per week
    weekly_authors = (comments.select(['subreddit','week','author'])
                      .distinct()
                      .groupby(['subreddit','week'])
                      .count()
                      .repartition('subreddit'))

    # Attach the pandas side tables in the same order as before: clusters, then densities.
    for side_table in (clusters, densities):
        weekly_authors = weekly_authors.join(spark.createDataFrame(side_table), on='subreddit', how='inner')

    weekly_authors.write.parquet(output, mode='overwrite')
if __name__ == "__main__":
fire.Fire(main)

View File

@@ -1 +0,0 @@
/annex/objects/SHA256E-s60874--d536adb0ec637fca262c4e1ec908dd8b4a5d1464047b583cd1a99cc6dba87191

View File

@@ -1,11 +0,0 @@
# Builds the interactive t-SNE visualisation HTML by running tsne_vis.py
# through start_spark_and_run.sh on one node. The commented-out rules below are
# older targets (wang similarity, comment authors) kept for reference.
all: subreddit_author_tf_similarities_10000.html #comment_authors_10000.html
# wang_tsne_10000.html
# wang_tsne_10000.html:/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather tsne_vis.py
# python3 tsne_vis.py --tsne_data=/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather --clusters=/gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather --output=wang_tsne_10000.html
# comment_authors_10000.html:/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather tsne_vis.py
# python3 tsne_vis.py --tsne_data=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --clusters=/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather --output=comment_authors_10000.html
# The target is rebuilt when the t-SNE data, the clustering, or tsne_vis.py changes.
subreddit_author_tf_similarities_10000.html:/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather tsne_vis.py
start_spark_and_run.sh 1 tsne_vis.py --tsne_data=/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather --clusters=/gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather --output=subreddit_author_tf_similarities_10000.html

View File

@@ -1 +0,0 @@
../../.git/annex/objects/Qk/wG/SHA256E-s145210--14a2ad6660d1e4015437eff556ec349dd10a115a4f96594152a29e83d00aa784/SHA256E-s145210--14a2ad6660d1e4015437eff556ec349dd10a115a4f96594152a29e83d00aa784

View File

@@ -1 +0,0 @@
../../.git/annex/objects/w7/2f/SHA256E-s44458--f1c5247775ecf06514a0ff9e523e944bc8fcd9d0fdb6f214cc1329b759d4354e/SHA256E-s44458--f1c5247775ecf06514a0ff9e523e944bc8fcd9d0fdb6f214cc1329b759d4354e

View File

@@ -1 +0,0 @@
../../.git/annex/objects/WX/v3/SHA256E-s190874--c2aea719f989dde297ca5f13371e156693c574e44acd9a0e313e5e3a3ad4b543/SHA256E-s190874--c2aea719f989dde297ca5f13371e156693c574e44acd9a0e313e5e3a3ad4b543

View File

@@ -1 +0,0 @@
../../.git/annex/objects/mq/2z/SHA256E-s58834--2e7b3ee11f47011fd9b34bddf8f1e788d35ab9c9e0bb6a1301b0b916135400cf/SHA256E-s58834--2e7b3ee11f47011fd9b34bddf8f1e788d35ab9c9e0bb6a1301b0b916135400cf

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1,175 +0,0 @@
import pyarrow
import altair as alt
alt.data_transformers.disable_max_rows()
alt.data_transformers.enable('default')
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from numpy import random
import fire
import numpy as np
def base_plot(plot_data):
    """Build the base text chart: one subreddit label per point, coloured by cluster.

    A single-selection bound to a dropdown of cluster ids (also settable by
    clicking) keeps the selected cluster in its ``color`` field colour and
    turns every other label light gray. Both axes are fixed to (-65, 65).
    """
    cluster_options = [str(c) for c in sorted(set(plot_data.cluster))]
    dropdown = alt.binding_select(options=cluster_options)
    cluster_selection = alt.selection_single(on='click', fields=['cluster'], bind=dropdown, name=' ')

    highlight = alt.condition(
        cluster_selection,
        alt.Color(field='color', type='nominal', scale=alt.Scale(scheme='category10')),
        alt.value("lightgray"),
    )

    x_channel = alt.X('x', axis=alt.Axis(grid=False), scale=alt.Scale(domain=(-65, 65)))
    y_channel = alt.Y('y', axis=alt.Axis(grid=False), scale=alt.Scale(domain=(-65, 65)))

    chart = alt.Chart(plot_data).mark_text().encode(
        x_channel,
        y_channel,
        color=highlight,
        text='subreddit',
    )
    return chart.add_selection(cluster_selection)
def zoom_plot(plot_data):
    """Return the base plot made interactive and sized 1275x800."""
    return base_plot(plot_data).interactive().properties(width=1275, height=800)
def viewport_plot(plot_data):
    """Build a three-panel chart: two brushable point overviews above a label view.

    The interval selection on the first overview sets the axis domains of the
    second; the x and y interval selections on the second set the axis
    domains of the text panel underneath.
    """
    # Selections are created before the base chart, in the original order.
    overview_brush = alt.selection_interval(encodings=['x','y'], init={'x':(-65,65),'y':(-65,65)})
    x_brush = alt.selection_interval(encodings=['x'], init={'x':(30,40)})
    y_brush = alt.selection_interval(encodings=['y'], init={'y':(-20,0)})

    base = base_plot(plot_data)

    points = base.mark_point(fillOpacity=0.2, opacity=0.2).encode(
        alt.X('x', axis=alt.Axis(grid=False)),
        alt.Y('y', axis=alt.Axis(grid=False)),
    ).properties(width=600, height=400)

    overview = points.add_selection(overview_brush)

    detail = points.encode(
        alt.X('x', axis=alt.Axis(grid=False), scale=alt.Scale(domain=overview_brush)),
        alt.Y('y', axis=alt.Axis(grid=False), scale=alt.Scale(domain=overview_brush)),
    )
    detail = detail.add_selection(x_brush).add_selection(y_brush)

    labels = base.encode(
        alt.X('x', axis=alt.Axis(grid=False), scale=alt.Scale(domain=x_brush)),
        alt.Y('y', axis=alt.Axis(grid=False), scale=alt.Scale(domain=y_brush)),
    ).properties(width=1275, height=600)

    return (overview | detail) & labels
def assign_cluster_colors(tsne_data, clusters, n_colors, n_neighbors = 4):
    """Give each cluster a colour id that differs from its already-coloured neighbours.

    Parameters
    ----------
    tsne_data : DataFrame with columns ``subreddit``, ``x`` and ``y``.
    clusters : DataFrame with columns ``subreddit`` and ``cluster``.
    n_colors : number of colour ids available (``0 .. n_colors - 1``).
    n_neighbors : number of nearest clusters, per centroid, that must not
        share a colour. When there are no more clusters than this, every
        cluster is treated as a neighbour of every other.

    Returns
    -------
    ``tsne_data`` inner-joined with ``clusters`` on ``subreddit``, plus an
    integer ``color`` column.

    Raises
    ------
    ValueError
        If some cluster has no colour left that its neighbours do not use.
    """
    tsne_data = tsne_data.merge(clusters, on='subreddit')

    # Centroids are indexed by cluster label in sorted order, which is also
    # the order groupby iterates in below.
    centroids = tsne_data.groupby('cluster').agg({'x': 'mean', 'y': 'mean'})
    n_clusters = centroids.shape[0]
    color_ids = np.arange(n_colors)

    points = np.array(tsne_data.loc[:, ['x', 'y']])
    centers = np.array(centroids.loc[:, ['x', 'y']])

    # point x centroid
    point_center_distances = np.linalg.norm(points[:, None, :] - centers[None, :, :], axis=-1)

    # distances[c, j]: smallest distance from any point of cluster c to centroid j.
    # Rows are addressed by the cluster's position among the centroids, not by
    # its raw label, so labels need not be exactly 0..k-1 (e.g. a -1 label).
    distances = np.empty(shape=(n_clusters, n_clusters))
    for row, (_, group) in enumerate(tsne_data.groupby('cluster')):
        # The merge above yields a default RangeIndex, so index labels are
        # valid row positions into `points`.
        distances[row, :] = point_center_distances[group.index.values, :].min(axis=0)

    if n_neighbors < n_clusters:
        nearest = distances.argpartition(n_neighbors, 0)[:n_neighbors, :]
    else:
        # argpartition needs kth < number of rows; with this few clusters
        # every cluster neighbours every centroid.
        nearest = np.repeat(np.arange(n_clusters)[:, None], n_clusters, axis=1)
    indices = nearest.T  # one row of neighbouring cluster positions per centroid

    color_assignments = np.repeat(-1, n_clusters)  # -1 marks "not coloured yet"
    for i, knn in enumerate(indices):
        # setdiff1d returns a sorted array, so the lowest free colour is
        # picked deterministically.
        available_colors = np.setdiff1d(color_ids, color_assignments[knn])
        if len(available_colors) == 0:
            raise ValueError("Can't color this many neighbors with this many colors")
        color_assignments[i] = available_colors[0]

    colors = pd.DataFrame({'cluster': centroids.index.values, 'color': color_assignments})
    tsne_data = tsne_data.merge(colors, on='cluster')
    return tsne_data
def build_visualization(tsne_data, clusters, output):
    """Render the zoomable and the viewport t-SNE charts for a clustering.

    Parameters
    ----------
    tsne_data : path to a feather file with the t-SNE coordinates.
    clusters : path to a feather file with the cluster assignments.
    output : path of the zoomable chart. The viewport chart is written next to
        it with ``_viewport`` inserted before the file extension.
    """
    from pathlib import Path

    # tsne_data = "/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather"
    # clusters = "/gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather"
    tsne_data = pd.read_feather(tsne_data)
    clusters = pd.read_feather(clusters)

    tsne_data = assign_cluster_colors(tsne_data, clusters, 10, 8)

    # The merge below needs sr_per_cluster; its computation had been commented
    # out, which made this function fail with a NameError.
    sr_per_cluster = tsne_data.groupby('cluster').subreddit.count().reset_index()
    sr_per_cluster = sr_per_cluster.rename(columns={'subreddit': 'cluster_size'})
    tsne_data = tsne_data.merge(sr_per_cluster, on='cluster')

    term_zoom_plot = zoom_plot(tsne_data)
    term_zoom_plot.save(output)

    # Derive the second file name from the path components, so an output that
    # does not contain ".html" no longer overwrites the zoom plot and ".html"
    # elsewhere in the path is left alone.
    output_path = Path(output)
    viewport_output = output_path.with_name(output_path.stem + "_viewport" + output_path.suffix)

    term_viewport_plot = viewport_plot(tsne_data)
    term_viewport_plot.save(str(viewport_output))
if __name__ == "__main__":
fire.Fire(build_visualization)
# commenter_data = pd.read_feather("tsne_author_fit.feather")
# clusters = pd.read_feather('author_3000_clusters.feather')
# commenter_data = assign_cluster_colors(commenter_data,clusters,10,8)
# commenter_zoom_plot = zoom_plot(commenter_data)
# commenter_viewport_plot = viewport_plot(commenter_data)
# commenter_zoom_plot.save("subreddit_commenters_tsne_3000.html")
# commenter_viewport_plot.save("subreddit_commenters_tsne_3000_viewport.html")
# chart = chart.properties(width=10000,height=10000)
# chart.save("test_tsne_whole.svg")