Merge branch 'excise_reindex' of code:cdsc_reddit into excise_reindex
commit ce549c6c97
@@ -2,9 +2,9 @@
 srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
 similarity_data=/gscratch/comdata/output/reddit_similarity
 clustering_data=/gscratch/comdata/output/reddit_clustering
-kmeans_selection_grid="--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]"
-hdbscan_selection_grid="--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=eom,leaf"
-affinity_selection_grid="--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]"
+kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]
+hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf]
+affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]

 authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather
 authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI
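Note: dropping the surrounding double quotes from the *_selection_grid values changes how the recipe lines expand. Make keeps the quotes as part of the variable's value, so `python3 ... $(hdbscan_selection_grid)` would hand the whole grid to the script as a single shell word; unquoted, each flag arrives as its own argument, which is presumably what the Python CLIs expect. A minimal sketch of the shell-side word splitting (shlex stands in for the shell; the flag values are copied from the hunk above):

    import shlex

    quoted = '"--max_iters=[3000] --n_inits=[10]"'
    unquoted = '--max_iters=[3000] --n_inits=[10]'

    # With the quotes, the grid reaches the script as one argv entry;
    # without them, each flag is split into its own entry.
    print(shlex.split(quoted))    # ['--max_iters=[3000] --n_inits=[10]']
    print(shlex.split(unquoted))  # ['--max_iters=[3000]', '--n_inits=[10]']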
@@ -91,7 +91,11 @@ ${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_inpu
 ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py
 	$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)

+${terms_10k_output_lsi}/best_hdbscan.feather:${terms_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
+	$(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2

+${authors_tf_10k_output_lsi}/best_hdbscan.feather:${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
+	$(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2

 clean_affinity:
 	rm -f ${authors_10k_output}/affinity/selection_data.csv
@@ -7,6 +7,7 @@ class grid_sweep:
     def __init__(self, jobtype, inpath, outpath, namer, *args):
         self.jobtype = jobtype
         self.namer = namer
+        print(*args)
         grid = list(product(*args))
         inpath = Path(inpath)
         outpath = Path(outpath)
@@ -59,7 +59,7 @@ class _hdbscan_lsi_grid_sweep(grid_sweep):

         self.lsi_dim = lsi_dim
         self.jobtype = hdbscan_lsi_job
-        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
+        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)


     def namer(self, *args, **kwargs):
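Note: grid_sweep builds its parameter grid with product(*args) (see the grid_sweep hunk above), so every positional argument has to be an iterable of candidate values. Passing the bare integer self.lsi_dim raises TypeError inside product; wrapping it as [self.lsi_dim] turns it into a one-value dimension. A minimal sketch with hypothetical values:

    from itertools import product

    lsi_dim = 850                  # hypothetical single LSI dimension
    min_cluster_sizes = [2, 3, 4]  # hypothetical sweep values

    # product() requires iterables; a bare int would raise
    # "TypeError: 'int' object is not iterable".
    grid = list(product([lsi_dim], min_cluster_sizes))
    print(grid)  # [(850, 2), (850, 3), (850, 4)]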
@@ -87,9 +87,9 @@ def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2]
     obj = hdbscan_lsi_grid_sweep(inpath,
                                  lsi_dimensions,
                                  outpath,
-                                 map(int,min_cluster_sizes),
-                                 map(int,min_samples),
-                                 map(float,cluster_selection_epsilons),
+                                 list(map(int,min_cluster_sizes)),
+                                 list(map(int,min_samples)),
+                                 list(map(float,cluster_selection_epsilons)),
                                  cluster_selection_methods
                                  )

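Note: in Python 3, map() returns a one-shot iterator. If a sweep dimension is iterated more than once (for example, once to build the product grid and again when naming outputs), the second pass sees nothing, which can silently yield an empty or truncated grid. Materializing with list() avoids that. A small illustration:

    sizes = map(int, ["2", "3", "4"])
    print(list(sizes))  # [2, 3, 4]
    print(list(sizes))  # [] -- the iterator is already exhausted

    # Materialized once, reusable everywhere it is needed:
    sizes = list(map(int, ["2", "3", "4"]))
    print(list(sizes))  # [2, 3, 4]
    print(list(sizes))  # [2, 3, 4]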
@@ -34,7 +34,7 @@ class _kmeans_lsi_grid_sweep(grid_sweep):
         print(kwargs)
         self.lsi_dim = lsi_dim
         self.jobtype = kmeans_lsi_job
-        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
+        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)

     def namer(self, *args, **kwargs):
         s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs)
@@ -2,15 +2,15 @@ import fire
 import pandas as pd
 from pathlib import Path
 import shutil
-selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/affinity/selection_data.csv"
+selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv"

 outpath = 'test_best.feather'
+min_clusters=50; max_isolates=5000; min_cluster_size=2

 # pick the best clustering according to silhouette score subject to contraints
-def pick_best_clustering(selection_data, output, min_clusters, max_isolates):
+def pick_best_clustering(selection_data, output, min_clusters, max_isolates, min_cluster_size):
     df = pd.read_csv(selection_data,index_col=0)
-    df = df.sort_values("silhouette_score")
+    df = df.sort_values("silhouette_score",ascending=False)

     # not sure I fixed the bug underlying this fully or not.
     df['n_isolates_str'] = df.n_isolates.str.strip("[]")
@@ -18,11 +18,10 @@ def pick_best_clustering(selection_data, output, min_clusters, max_isolates):
     df.loc[df.n_isolates_0,'n_isolates'] = 0
     df.loc[~df.n_isolates_0,'n_isolates'] = df.loc[~df.n_isolates_0].n_isolates_str.apply(lambda l: int(l))

-    best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)].iloc[df.shape[1]]
+    best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)&(df.min_cluster_size==min_cluster_size)].iloc[df.shape[1]]

     print(best_cluster.to_dict())
     best_path = Path(best_cluster.outpath) / (str(best_cluster['name']) + ".feather")

     shutil.copy(best_path,output)

 if __name__ == "__main__":
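Note: pandas sort_values defaults to ascending order, which would leave the worst silhouette score at the top; ascending=False puts the best-scoring rows first before the constraint filter runs, and the new min_cluster_size term narrows the candidates further. A toy sketch of the sort-then-filter pattern (made-up numbers; .iloc[0] here simply takes the top surviving row):

    import pandas as pd

    df = pd.DataFrame({
        "silhouette_score": [0.30, 0.52, 0.45],
        "n_clusters":       [40,   120,  90],
        "n_isolates":       [100,  2000, 9000],
        "min_cluster_size": [2,    2,    2],
    })

    df = df.sort_values("silhouette_score", ascending=False)
    ok = df[(df.n_isolates <= 5000) & (df.n_clusters >= 50) & (df.min_cluster_size == 2)]
    print(ok.iloc[0].to_dict())  # the 0.52 row: best score satisfying the constraints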
@@ -1,7 +1,38 @@
-import fire
-from select_affinity import select_affinity_clustering
-from select_kmeans import select_kmeans_clustering
+import pandas as pd
+import plotnine as pn
+from pathlib import Path
+from clustering.fit_tsne import fit_tsne
+from visualization.tsne_vis import build_visualization

+df = pd.read_csv("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv",index_col=0)
+
+# plot silhouette_score as a function of isolates
+df = df.sort_values("silhouette_score")
+
+df['n_isolates'] = df.n_isolates.str.split("\n0").apply(lambda rg: int(rg[1]))
+p = pn.ggplot(df,pn.aes(x='n_isolates',y='silhouette_score')) + pn.geom_point()
+p.save("isolates_x_score.png")
+
+p = pn.ggplot(df,pn.aes(y='n_clusters',x='n_isolates',color='silhouette_score')) + pn.geom_point()
+p.save("clusters_x_isolates.png")
+
+# the best result for hdbscan seems like this one: it has a decent number of
+# i think I prefer the 'eom' clustering style because larger clusters are less likely to suffer from ommitted variables
+best_eom = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='eom')&(df.min_cluster_size==2)].iloc[df.shape[1]]
+
+best_lsi = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='leaf')&(df.min_cluster_size==2)].iloc[df.shape[1]]
+
+tsne_data = Path("./clustering/authors-tf_lsi850_tsne.feather")
+
+if not tnse_data.exists():
+    fit_tsne("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather",
+             tnse_data)
+
+build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
+                    Path(best_eom.outpath)/(best_eom['name']+'.feather'),
+                    "./authors-tf_lsi850_best_eom.html")
+
+build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
+                    Path(best_leaf.outpath)/(best_leaf['name']+'.feather'),
+                    "./authors-tf_lsi850_best_leaf.html")
+
-if __name__ == "__main__":
-    fire.Fire({"kmeans":select_kmeans_clustering,
-               "affinity":select_affinity_clustering})
@@ -8,3 +8,9 @@ all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscrat

 /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
 	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather" --agg=pd.DataFrame.sum
+
+/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather
+	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather" --agg=pd.DataFrame.sum
+
+/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather
+	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather" --agg=pd.DataFrame.sum
@@ -1,4 +1,4 @@
 #!/usr/bin/bash
 start_spark_cluster.sh
-spark-submit --master spark://$(hostname):18899 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --outpath=/gscratch/comdata/output/reddit_density/comment_authors_10000.feather --agg=pd.DataFrame.sum
-stop-all.sh
+singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather --outpath=/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather --agg=pd.DataFrame.sum
+singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh
@@ -1,11 +1,12 @@
 import pandas as pd
 from pandas.core.groupby import DataFrameGroupBy as GroupBy
+from pathlib import Path
 import fire
 import numpy as np
 import sys
 sys.path.append("..")
 sys.path.append("../similarities")
-from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_interval
+from similarities.similarities_helper import reindex_tfidf

 # this is the mean of the ratio of the overlap to the focal size.
 # mean shared membership per focal community member
@@ -13,10 +14,12 @@ from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_i

 def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
     df = pd.read_feather(inpath)
-    df = df.drop('subreddit',1)
+    df = df.drop('_subreddit',1)
     np.fill_diagonal(df.values,0)
     df = agg(df, 0).reset_index()
     df = df.rename({0:'overlap_density'},axis='columns')
+    outpath = Path(outpath)
+    outpath.parent.mkdir(parents=True, exist_ok = True)
     df.to_feather(outpath)
     return df

@@ -25,6 +28,8 @@ def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
     # exclude the diagonal
     df = df.loc[df.subreddit != df.variable]
     res = agg(df.groupby(['subreddit','week'])).reset_index()
+    outpath = Path(outpath)
+    outpath.parent.mkdir(parents=True, exist_ok = True)
     res.to_feather(outpath)
     return res

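Note: the two hunks above call outpath.parent.mkdir(parents=True, exist_ok=True) before writing, so the density jobs no longer fail when the output directory (for example the new ..._10K_LSI/ subdirectories) does not exist yet. A minimal sketch with a hypothetical path:

    from pathlib import Path
    import pandas as pd

    outpath = Path("/tmp/reddit_density_example/10K_LSI/600.feather")  # hypothetical
    # Creates any missing parent directories; a no-op if they already exist.
    outpath.parent.mkdir(parents=True, exist_ok=True)

    pd.DataFrame({"overlap_density": [0.1, 0.2]}).to_feather(outpath)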
@@ -8,7 +8,5 @@ wget -r --no-parent -A 'RC_201*.bz2' -U $user_agent -P $output_dir -nd -nc $base
 wget -r --no-parent -A 'RC_201*.xz' -U $user_agent -P $output_dir -nd -nc $base_url
 wget -r --no-parent -A 'RC_201*.zst' -U $user_agent -P $output_dir -nd -nc $base_url

-# starting in 2020 we use daily dumps not monthly dumps
-wget -r --no-parent -A 'RC_202*.gz' -U $user_agent -P $output_dir -nd -nc $base_url/daily/

 ./check_comments_shas.py
@@ -4,44 +4,49 @@ from pathlib import Path
 from similarities_helper import similarities, column_similarities
 from functools import partial

-def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'):
+def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'):

-    return similarities(inpath=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
+    return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)

 # change so that these take in an input as an optional argument (for speed, but also for idf).
 def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):

-    return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet',
+def term_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
+
+    return cosine_similarities(infile,
                                'term',
                                outfile,
                                min_df,
                                max_df,
                                included_subreddits,
                                topN,
+                               exclude_phrases,
                                from_date,
                                to_date
                                )

-def author_cosine_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
-    return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',
+def author_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
+    return cosine_similarities(infile,
                                'author',
                                outfile,
                                min_df,
                                max_df,
                                included_subreddits,
                                topN,
+                               exclude_phrases=False,
                                from_date=from_date,
                                to_date=to_date
                                )

-def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
-    return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',
+def author_tf_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
+    return cosine_similarities(infile,
                                'author',
                                outfile,
                                min_df,
                                max_df,
                                included_subreddits,
                                topN,
+                               exclude_phrases=False,
                                from_date=from_date,
                                to_date=to_date,
                                tfidf_colname='relative_tf'
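Note: this hunk turns the hard-coded TF-IDF parquet paths into infile keyword arguments whose defaults are the old paths, and threads exclude_phrases through to similarities(). The pattern, sketched with a hypothetical stand-in function:

    DEFAULT_INFILE = "/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet"

    def term_cosine_similarities(outfile, infile=DEFAULT_INFILE, topN=500, exclude_phrases=False):
        # Stand-in body: the real function forwards these to cosine_similarities().
        print(f"infile={infile} outfile={outfile} topN={topN} exclude_phrases={exclude_phrases}")

    term_cosine_similarities("terms.feather")                          # existing callers keep the old behaviour
    term_cosine_similarities("terms.feather", infile="other.parquet")  # new callers can override the input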
@@ -1,4 +1,4 @@
 #!/usr/bin/bash
 start_spark_cluster.sh
-singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 lsi_similarities.py author --outfile=/gscratch/comdata/output//reddit_similarity/subreddit_comment_authors_10k_LSI.feather --topN=10000
+singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname):7077 top_subreddits_by_comments.py
 singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh
@@ -97,6 +97,7 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu
                   'relative_tf':ds.field('relative_tf').cast('float32'),
                   'tf_idf':ds.field('tf_idf').cast('float32')}

+
     df = tfidf_ds.to_table(filter=ds_filter,columns=projection)

     df = df.to_pandas(split_blocks=True,self_destruct=True)
@@ -124,6 +125,17 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu

         return (df, tfidf_ds, ds_filter)

+    with Pool(cpu_count()) as pool:
+        chunks = pool.imap_unordered(pull_names,batches)
+        subreddit_names = pd.concat(chunks,copy=False).drop_duplicates()
+
+    subreddit_names = subreddit_names.set_index("subreddit_id")
+    new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates()
+    new_ids = new_ids.set_index('subreddit_id')
+    subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index()
+    subreddit_names = subreddit_names.drop("subreddit_id",1)
+    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
+    return(df, subreddit_names)

 def pull_names(batch):
     return(batch.to_pandas().drop_duplicates())
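Note: the added block parallelizes pulling subreddit names across dataset batches with a process pool; imap_unordered yields each worker's chunk as soon as it is ready and pd.concat stitches them back together. A self-contained sketch of the same pattern (plain DataFrames stand in for the pyarrow record batches used in the real code):

    from multiprocessing import Pool, cpu_count
    import pandas as pd

    def pull_names(batch):
        # The real code calls batch.to_pandas(); here the batch is already a DataFrame.
        return batch.drop_duplicates()

    if __name__ == "__main__":
        batches = [pd.DataFrame({"subreddit_id": [1, 2], "subreddit": ["a", "b"]}),
                   pd.DataFrame({"subreddit_id": [2, 3], "subreddit": ["b", "c"]})]
        with Pool(cpu_count()) as pool:
            chunks = pool.imap_unordered(pull_names, batches)
            names = pd.concat(chunks, copy=False).drop_duplicates()
        print(names.sort_values("subreddit_id"))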
@@ -165,7 +177,6 @@ def similarities(inpath, simfunc, term_colname, outfile, min_df=None, max_df=Non

     print(f'computing similarities on mat. mat.shape:{mat.shape}')
     print(f"size of mat is:{mat.data.nbytes}",flush=True)
-    # transform this to debug term tfidf
     sims = simfunc(mat)
     del mat

@@ -257,12 +268,11 @@ def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=196
     else:
         return sims


 def column_similarities(mat):
     return 1 - pairwise_distances(mat,metric='cosine')

-# need to rewrite this so that subreddit ids and term ids are fixed over the whole thing.
-# this affords taking the LSI similarities.
-# fill all 0s if we don't have it.
 def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05):
     term = term_colname
     term_id = term + '_id'
@@ -295,7 +305,6 @@ def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weig
     subreddits = df.select(['subreddit']).distinct()
     subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.orderBy("subreddit")))

-    # df = df.cache()
     df = df.join(subreddits,on=['subreddit'])

     # map terms to indexes in the tfs and the idfs
@@ -52,7 +52,7 @@ def tfidf_terms(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/commen

 def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet',
                          topN=None,
-                         include_subreddits=None):
+                         included_subreddits=None):

     return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
                         outpath,
@@ -63,7 +63,8 @@ def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfi
                         )

 def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
-                       topN=25000):
+                       topN=None,
+                       included_subreddits=None):


     return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
@@ -71,7 +72,7 @@ def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf
                        topN,
                        'term',
                        [],
-                       included_subreddits=None
+                       included_subreddits=included_subreddits
                        )


@@ -17,7 +17,7 @@ df = df.filter(~df.subreddit.like("u_%"))
 df = df.groupBy('subreddit').agg(f.count('id').alias("n_comments"))

 df = df.join(prop_nsfw,on='subreddit')
-df = df.filter(df.prop_nsfw < 0.5)
+#df = df.filter(df.prop_nsfw < 0.5)

 win = Window.orderBy(f.col('n_comments').desc())
 df = df.withColumn('comments_rank', f.rank().over(win))
@@ -26,4 +26,4 @@ df = df.toPandas()

 df = df.sort_values("n_comments")

-df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv', index=False)
+df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nsfw.csv', index=False)