
Merge branch 'excise_reindex' of code:cdsc_reddit into excise_reindex

Nathan TeBlunthuis 2021-08-03 15:13:21 -07:00
commit ce549c6c97
15 changed files with 103 additions and 44 deletions


@@ -2,9 +2,9 @@
 srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
 similarity_data=/gscratch/comdata/output/reddit_similarity
 clustering_data=/gscratch/comdata/output/reddit_clustering
-kmeans_selection_grid="--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]"
+kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]
-hdbscan_selection_grid="--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=eom,leaf"
+hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf]
-affinity_selection_grid="--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]"
+affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]
 authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather
 authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI
@@ -91,7 +91,11 @@ ${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_inpu
 ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py
         $(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
+${terms_10k_output_lsi}/best_hdbscan.feather:${terms_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
+        $(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2
+${authors_tf_10k_output_lsi}/best_hdbscan.feather:${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
+        $(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2
 clean_affinity:
         rm -f ${authors_10k_output}/affinity/selection_data.csv
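A note on why the quotes around these grid variables appear to have been dropped: make pastes the variable verbatim into the shell command line, so with the quotes the entire grid reaches the clustering script as a single argument rather than as separate --flags. A quick, self-contained way to see the difference (shlex mimics the shell's word splitting; the two-value grids here are just illustrative):

    import shlex

    quoted   = 'python3 hdbscan_clustering_lsi.py "--min_cluster_sizes=[2,3] --min_samples=[2,3]"'
    unquoted = 'python3 hdbscan_clustering_lsi.py --min_cluster_sizes=[2,3] --min_samples=[2,3]'

    # with the quotes, both flags arrive glued together as one argv element
    print(shlex.split(quoted))
    # ['python3', 'hdbscan_clustering_lsi.py', '--min_cluster_sizes=[2,3] --min_samples=[2,3]']

    # without them, each flag is its own argv element, which is what the scripts expect
    print(shlex.split(unquoted))
    # ['python3', 'hdbscan_clustering_lsi.py', '--min_cluster_sizes=[2,3]', '--min_samples=[2,3]']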


@@ -7,6 +7,7 @@ class grid_sweep:
     def __init__(self, jobtype, inpath, outpath, namer, *args):
         self.jobtype = jobtype
         self.namer = namer
+        print(*args)
         grid = list(product(*args))
         inpath = Path(inpath)
         outpath = Path(outpath)
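For context, grid_sweep turns the parameter lists it receives into one job per combination via itertools.product; a minimal illustration of the grid that `grid = list(product(*args))` materializes (the parameter values are made up):

    from itertools import product

    min_cluster_sizes = [2, 3]
    min_samples = [2, 3]
    cluster_selection_methods = ['eom', 'leaf']

    # same pattern as grid = list(product(*args)) in grid_sweep.__init__
    grid = list(product(min_cluster_sizes, min_samples, cluster_selection_methods))
    print(len(grid))  # 8 parameter combinations
    print(grid[0])    # (2, 2, 'eom')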


@@ -59,7 +59,7 @@ class _hdbscan_lsi_grid_sweep(grid_sweep):
         self.lsi_dim = lsi_dim
         self.jobtype = hdbscan_lsi_job
-        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
+        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)
     def namer(self, *args, **kwargs):
@@ -87,9 +87,9 @@ def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2]
     obj = hdbscan_lsi_grid_sweep(inpath,
                                  lsi_dimensions,
                                  outpath,
-                                 map(int,min_cluster_sizes),
-                                 map(int,min_samples),
-                                 map(float,cluster_selection_epsilons),
+                                 list(map(int,min_cluster_sizes)),
+                                 list(map(int,min_samples)),
+                                 list(map(float,cluster_selection_epsilons)),
                                  cluster_selection_methods
                                  )
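Two small things are going on in this hunk. Wrapping the scalar LSI dimension as [self.lsi_dim] matters because itertools.product only accepts iterables, so a bare int would raise a TypeError; the list makes it a one-value axis of the grid. Wrapping the map(...) calls in list(...) matters because a map object is a one-shot iterator: if the same parameter iterables are reused (for example, once per LSI dimension), every use after the first sees them empty. A self-contained illustration of both points:

    from itertools import product

    # a map object is exhausted after its first traversal
    sizes = map(int, ["2", "3"])
    print(list(product([850], sizes)))  # [(850, 2), (850, 3)]
    print(list(product([850], sizes)))  # [] -- already consumed

    # materializing it with list() makes it safe to reuse
    sizes = list(map(int, ["2", "3"]))
    print(list(product([850], sizes)))  # [(850, 2), (850, 3)]
    print(list(product([850], sizes)))  # same result again

    # product() needs iterables: product(850, sizes) raises TypeError,
    # while product([850], sizes) treats 850 as a single-value axis.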


@@ -34,7 +34,7 @@ class _kmeans_lsi_grid_sweep(grid_sweep):
         print(kwargs)
         self.lsi_dim = lsi_dim
         self.jobtype = kmeans_lsi_job
-        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
+        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)
     def namer(self, *args, **kwargs):
         s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs)


@@ -2,15 +2,15 @@ import fire
 import pandas as pd
 from pathlib import Path
 import shutil
-selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv"
+selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/affinity/selection_data.csv"
 outpath = 'test_best.feather'
+min_clusters=50; max_isolates=5000; min_cluster_size=2
 # pick the best clustering according to silhouette score subject to contraints
-def pick_best_clustering(selection_data, output, min_clusters, max_isolates):
+def pick_best_clustering(selection_data, output, min_clusters, max_isolates, min_cluster_size):
     df = pd.read_csv(selection_data,index_col=0)
-    df = df.sort_values("silhouette_score")
+    df = df.sort_values("silhouette_score",ascending=False)
     # not sure I fixed the bug underlying this fully or not.
     df['n_isolates_str'] = df.n_isolates.str.strip("[]")
@@ -18,11 +18,10 @@ def pick_best_clustering(selection_data, output, min_clusters, max_isolates):
     df.loc[df.n_isolates_0,'n_isolates'] = 0
     df.loc[~df.n_isolates_0,'n_isolates'] = df.loc[~df.n_isolates_0].n_isolates_str.apply(lambda l: int(l))
-    best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)].iloc[df.shape[1]]
+    best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)&(df.min_cluster_size==min_cluster_size)].iloc[df.shape[1]]
     print(best_cluster.to_dict())
     best_path = Path(best_cluster.outpath) / (str(best_cluster['name']) + ".feather")
     shutil.copy(best_path,output)
 if __name__ == "__main__":
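The selection logic above boils down to: sort the runs by silhouette score, keep only those satisfying the constraints, take the best remaining run, and copy its clustering to the requested output. A compressed sketch of that flow (assuming selection_data.csv has the columns used above -- silhouette_score, n_isolates, n_clusters, min_cluster_size, outpath, name -- and ignoring the n_isolates string coercion handled in the real function):

    import shutil
    from pathlib import Path
    import pandas as pd

    def pick_best(selection_data, output, min_clusters=50, max_isolates=5000, min_cluster_size=2):
        df = pd.read_csv(selection_data, index_col=0)
        df = df.sort_values("silhouette_score", ascending=False)
        admissible = df[(df.n_isolates <= max_isolates)
                        & (df.n_clusters >= min_clusters)
                        & (df.min_cluster_size == min_cluster_size)]
        best = admissible.iloc[0]  # highest silhouette score among admissible runs
        best_path = Path(best.outpath) / (str(best["name"]) + ".feather")
        shutil.copy(best_path, output)  # publish the winning clustering
        return best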


@@ -1,7 +1,38 @@
-import fire
-from select_affinity import select_affinity_clustering
-from select_kmeans import select_kmeans_clustering
+import pandas as pd
+import plotnine as pn
+from pathlib import Path
+from clustering.fit_tsne import fit_tsne
+from visualization.tsne_vis import build_visualization
+
+df = pd.read_csv("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv",index_col=0)
+
+# plot silhouette_score as a function of isolates
+df = df.sort_values("silhouette_score")
+
+df['n_isolates'] = df.n_isolates.str.split("\n0").apply(lambda rg: int(rg[1]))
+p = pn.ggplot(df,pn.aes(x='n_isolates',y='silhouette_score')) + pn.geom_point()
+p.save("isolates_x_score.png")
+
+p = pn.ggplot(df,pn.aes(y='n_clusters',x='n_isolates',color='silhouette_score')) + pn.geom_point()
+p.save("clusters_x_isolates.png")
+
+# the best result for hdbscan seems like this one: it has a decent number of
+# i think I prefer the 'eom' clustering style because larger clusters are less likely to suffer from ommitted variables
+best_eom = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='eom')&(df.min_cluster_size==2)].iloc[df.shape[1]]
+
+best_lsi = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='leaf')&(df.min_cluster_size==2)].iloc[df.shape[1]]
+
+tsne_data = Path("./clustering/authors-tf_lsi850_tsne.feather")
+if not tnse_data.exists():
+    fit_tsne("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather",
+             tnse_data)
+
+build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
+                    Path(best_eom.outpath)/(best_eom['name']+'.feather'),
+                    "./authors-tf_lsi850_best_eom.html")
+
+build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
+                    Path(best_leaf.outpath)/(best_leaf['name']+'.feather'),
+                    "./authors-tf_lsi850_best_leaf.html")
-if __name__ == "__main__":
-    fire.Fire({"kmeans":select_kmeans_clustering,
-               "affinity":select_affinity_clustering})


@@ -8,3 +8,9 @@ all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscrat
 /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
         start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather" --agg=pd.DataFrame.sum
+
+/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather
+        start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather" --agg=pd.DataFrame.sum
+
+/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather
+        start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather" --agg=pd.DataFrame.sum


@@ -1,4 +1,4 @@
 #!/usr/bin/bash
 start_spark_cluster.sh
-spark-submit --master spark://$(hostname):18899 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --outpath=/gscratch/comdata/output/reddit_density/comment_authors_10000.feather --agg=pd.DataFrame.sum
-stop-all.sh
+singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather --outpath=/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather --agg=pd.DataFrame.sum
+singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh


@@ -1,11 +1,12 @@
 import pandas as pd
 from pandas.core.groupby import DataFrameGroupBy as GroupBy
+from pathlib import Path
 import fire
 import numpy as np
 import sys
 sys.path.append("..")
 sys.path.append("../similarities")
-from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_interval
+from similarities.similarities_helper import reindex_tfidf
 # this is the mean of the ratio of the overlap to the focal size.
 # mean shared membership per focal community member
@@ -13,10 +14,12 @@ from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_i
 def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
     df = pd.read_feather(inpath)
-    df = df.drop('subreddit',1)
+    df = df.drop('_subreddit',1)
     np.fill_diagonal(df.values,0)
     df = agg(df, 0).reset_index()
     df = df.rename({0:'overlap_density'},axis='columns')
+    outpath = Path(outpath)
+    outpath.parent.mkdir(parents=True, exist_ok = True)
     df.to_feather(outpath)
     return df
@@ -25,6 +28,8 @@ def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
     # exclude the diagonal
     df = df.loc[df.subreddit != df.variable]
     res = agg(df.groupby(['subreddit','week'])).reset_index()
+    outpath = Path(outpath)
+    outpath.parent.mkdir(parents=True, exist_ok = True)
     res.to_feather(outpath)
     return res
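As a reference for what overlap_density computes: drop the subreddit label column, zero the diagonal so self-similarity does not count, and sum each column. A toy end-to-end check (the 3x3 similarity values are made up; the real function additionally creates the output directory with Path(outpath).parent.mkdir(parents=True, exist_ok=True) before writing):

    import numpy as np
    import pandas as pd

    # toy symmetric similarity matrix for three subreddits a, b, c
    mat = np.array([[1.0, 0.2, 0.4],
                    [0.2, 1.0, 0.1],
                    [0.4, 0.1, 1.0]])
    np.fill_diagonal(mat, 0)  # self-similarity should not contribute to density

    df = pd.DataFrame(mat, columns=["a", "b", "c"])
    density = df.sum(0).reset_index()  # column sums, one row per subreddit
    density = density.rename({0: "overlap_density", "index": "subreddit"}, axis="columns")
    print(density)  # a: 0.6, b: 0.3, c: 0.5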


@@ -8,7 +8,5 @@ wget -r --no-parent -A 'RC_201*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url
 wget -r --no-parent -A 'RC_201*.xz' -U $user_agent -P $output_dir -nd -nc $base_url
 wget -r --no-parent -A 'RC_201*.zst' -U $user_agent -P $output_dir -nd -nc $base_url
-# starting in 2020 we use daily dumps not monthly dumps
-wget -r --no-parent -A 'RC_202*.gz' -U $user_agent -P $output_dir -nd -nc $base_url/daily/
 ./check_comments_shas.py


@@ -4,44 +4,49 @@ from pathlib import Path
 from similarities_helper import similarities, column_similarities
 from functools import partial
-def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'):
-    return similarities(inpath=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
+def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'):
+    return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
 # change so that these take in an input as an optional argument (for speed, but also for idf).
 def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
-    return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet',
+def term_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
+    return cosine_similarities(infile,
                                'term',
                                outfile,
                                min_df,
                                max_df,
                                included_subreddits,
                                topN,
+                               exclude_phrases,
                               from_date,
                               to_date
                               )
-def author_cosine_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
-    return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',
+def author_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
+    return cosine_similarities(infile,
                                'author',
                                outfile,
                                min_df,
                                max_df,
                                included_subreddits,
                                topN,
+                               exclude_phrases=False,
                               from_date=from_date,
                               to_date=to_date
                               )
-def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
-    return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',
+def author_tf_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
+    return cosine_similarities(infile,
                                'author',
                                outfile,
                                min_df,
                                max_df,
                                included_subreddits,
                                topN,
+                               exclude_phrases=False,
                               from_date=from_date,
                               to_date=to_date,
                               tfidf_colname='relative_tf'


@@ -1,4 +1,4 @@
 #!/usr/bin/bash
 start_spark_cluster.sh
-singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 lsi_similarities.py author --outfile=/gscratch/comdata/output//reddit_similarity/subreddit_comment_authors_10k_LSI.feather --topN=10000
+singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname):7077 top_subreddits_by_comments.py
 singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh


@@ -97,6 +97,7 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu
                   'relative_tf':ds.field('relative_tf').cast('float32'),
                   'tf_idf':ds.field('tf_idf').cast('float32')}
+    df = tfidf_ds.to_table(filter=ds_filter,columns=projection)
     df = df.to_pandas(split_blocks=True,self_destruct=True)
@@ -124,6 +125,17 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu
     return (df, tfidf_ds, ds_filter)
+    with Pool(cpu_count()) as pool:
+        chunks = pool.imap_unordered(pull_names,batches)
+        subreddit_names = pd.concat(chunks,copy=False).drop_duplicates()
+    subreddit_names = subreddit_names.set_index("subreddit_id")
+    new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates()
+    new_ids = new_ids.set_index('subreddit_id')
+    subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index()
+    subreddit_names = subreddit_names.drop("subreddit_id",1)
+    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
+    return(df, subreddit_names)
 def pull_names(batch):
     return(batch.to_pandas().drop_duplicates())
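The block added above fans record batches out to a process pool and stitches the results back together. A minimal, self-contained sketch of the same Pool.imap_unordered + concat pattern (the batch contents are made up; the real pull_names receives Arrow record batches rather than lists of dicts):

    from multiprocessing import Pool, cpu_count
    import pandas as pd

    def pull_names(batch):
        # stand-in for the real pull_names: each batch yields (subreddit_id, subreddit) rows
        return pd.DataFrame(batch).drop_duplicates()

    if __name__ == "__main__":
        batches = [
            [{"subreddit_id": 1, "subreddit": "a"}, {"subreddit_id": 2, "subreddit": "b"}],
            [{"subreddit_id": 2, "subreddit": "b"}, {"subreddit_id": 3, "subreddit": "c"}],
        ]
        with Pool(cpu_count()) as pool:
            chunks = pool.imap_unordered(pull_names, batches)
            subreddit_names = pd.concat(chunks, copy=False).drop_duplicates()
        print(subreddit_names.sort_values("subreddit_id"))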
@@ -165,7 +177,6 @@ def similarities(inpath, simfunc, term_colname, outfile, min_df=None, max_df=Non
     print(f'computing similarities on mat. mat.shape:{mat.shape}')
     print(f"size of mat is:{mat.data.nbytes}",flush=True)
-    # transform this to debug term tfidf
     sims = simfunc(mat)
     del mat
@@ -256,13 +267,12 @@ def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=196
         yield (sims, n_dims)
     else:
         return sims
 def column_similarities(mat):
     return 1 - pairwise_distances(mat,metric='cosine')
+# need to rewrite this so that subreddit ids and term ids are fixed over the whole thing.
+# this affords taking the LSI similarities.
+# fill all 0s if we don't have it.
 def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05):
     term = term_colname
     term_id = term + '_id'
@@ -295,7 +305,6 @@ def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weig
     subreddits = df.select(['subreddit']).distinct()
     subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.orderBy("subreddit")))
-    # df = df.cache()
     df = df.join(subreddits,on=['subreddit'])
     # map terms to indexes in the tfs and the idfs
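column_similarities above is plain cosine similarity computed as one minus the cosine distance matrix; a toy check of that formula on a small dense matrix (the vectors are made up, and which axis holds the subreddits is left to the real tf-idf layout):

    import numpy as np
    from sklearn.metrics import pairwise_distances

    mat = np.array([[1.0, 0.0, 2.0],    # parallel to the second row -> similarity 1.0
                    [0.5, 0.0, 1.0],
                    [0.0, 3.0, 0.0]])   # orthogonal to the others -> similarity 0.0

    sims = 1 - pairwise_distances(mat, metric="cosine")  # same formula as column_similarities
    print(np.round(sims, 3))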


@@ -52,7 +52,7 @@ def tfidf_terms(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/commen
 def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet',
                          topN=None,
-                         include_subreddits=None):
+                         included_subreddits=None):
     return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
                         outpath,
@@ -63,7 +63,8 @@ def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfi
                         )
 def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
-                       topN=25000):
+                       topN=None,
+                       included_subreddits=None):
     return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
@@ -71,7 +72,7 @@ def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf
                         topN,
                         'term',
                         [],
-                        included_subreddits=None
+                        included_subreddits=included_subreddits
                         )


@@ -17,7 +17,7 @@ df = df.filter(~df.subreddit.like("u_%"))
 df = df.groupBy('subreddit').agg(f.count('id').alias("n_comments"))
 df = df.join(prop_nsfw,on='subreddit')
-df = df.filter(df.prop_nsfw < 0.5)
+#df = df.filter(df.prop_nsfw < 0.5)
 win = Window.orderBy(f.col('n_comments').desc())
 df = df.withColumn('comments_rank', f.rank().over(win))
@@ -26,4 +26,4 @@ df = df.toPandas()
 df = df.sort_values("n_comments")
-df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv', index=False)
+df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nsfw.csv', index=False)