Refactor and reorganze.
This commit is contained in:
parent
a60747292e
commit
e6294b5b90
@ -1,119 +0,0 @@
|
||||
from pyspark.sql import functions as f
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql import Window
|
||||
import numpy as np
|
||||
import pyarrow
|
||||
import pandas as pd
|
||||
import fire
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from similarities_helper import cosine_similarities, prep_tfidf_entries, read_tfidf_matrix, column_similarities
|
||||
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
conf = spark.sparkContext.getConf()
|
||||
|
||||
# outfile = '/gscratch/comdata/users/nathante/test_similarities_500.feather'; min_df = None; included_subreddits=None; similarity_threshold=0;
|
||||
def author_cosine_similarities(outfile, min_df = None, included_subreddits=None, similarity_threshold=0, topN=500):
|
||||
'''
|
||||
Compute similarities between subreddits based on tfi-idf vectors of author comments
|
||||
|
||||
included_subreddits : string
|
||||
Text file containing a list of subreddits to include (one per line) if included_subreddits is None then do the top 500 subreddits
|
||||
|
||||
similarity_threshold : double (default = 0)
|
||||
set > 0 for large numbers of subreddits to get an approximate solution using the DIMSUM algorithm
|
||||
https://stanford.edu/~rezab/papers/dimsum.pdf. If similarity_threshold=0 we get an exact solution using an O(N^2) algorithm.
|
||||
|
||||
min_df : int (default = 0.1 * (number of included_subreddits)
|
||||
exclude terms that appear in fewer than this number of documents.
|
||||
|
||||
outfile: string
|
||||
where to output csv and feather outputs
|
||||
'''
|
||||
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
conf = spark.sparkContext.getConf()
|
||||
print(outfile)
|
||||
|
||||
tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_authors.parquet')
|
||||
|
||||
if included_subreddits is None:
|
||||
rankdf = pd.read_csv("/gscratch/comdata/users/nathante/cdsc-reddit/subreddits_by_num_comments.csv")
|
||||
included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values)
|
||||
|
||||
else:
|
||||
included_subreddits = set(open(included_subreddits))
|
||||
|
||||
print("creating temporary parquet with matrix indicies")
|
||||
tempdir = prep_tfidf_entries(tfidf, 'author', min_df, included_subreddits)
|
||||
tfidf = spark.read.parquet(tempdir.name)
|
||||
subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
|
||||
subreddit_names = subreddit_names.sort_values("subreddit_id_new")
|
||||
subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
|
||||
spark.stop()
|
||||
|
||||
print("loading matrix")
|
||||
mat = read_tfidf_matrix(tempdir.name,'author')
|
||||
print('computing similarities')
|
||||
sims = column_similarities(mat)
|
||||
del mat
|
||||
|
||||
sims = pd.DataFrame(sims.todense())
|
||||
sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)},axis=1)
|
||||
sims['subreddit'] = subreddit_names.subreddit.values
|
||||
|
||||
p = Path(outfile)
|
||||
|
||||
output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
|
||||
output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
|
||||
output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))
|
||||
|
||||
sims.to_feather(outfile)
|
||||
tempdir.cleanup()
|
||||
|
||||
# print(outfile)
|
||||
|
||||
# tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_authors.parquet')
|
||||
|
||||
# if included_subreddits is None:
|
||||
# included_subreddits = list(islice(open("/gscratch/comdata/users/nathante/cdsc-reddit/top_25000_subs_by_comments.txt"),topN))
|
||||
# included_subreddits = {s.strip('\n') for s in included_subreddits}
|
||||
|
||||
# else:
|
||||
# included_subreddits = set(open(included_subreddits))
|
||||
|
||||
# sim_dist, tfidf = cosine_similarities(tfidf, 'author', min_df, included_subreddits, similarity_threshold)
|
||||
|
||||
# p = Path(outfile)
|
||||
|
||||
# output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
|
||||
# output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
|
||||
# output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))
|
||||
# sim_dist = sim_dist.entries.toDF()
|
||||
|
||||
# sim_dist = sim_dist.repartition(1)
|
||||
# sim_dist.write.parquet(str(output_parquet),mode='overwrite',compression='snappy')
|
||||
|
||||
|
||||
|
||||
# #instead of toLocalMatrix() why not read as entries and put strait into numpy
|
||||
# sim_entries = pd.read_parquet(output_parquet)
|
||||
|
||||
# df = tfidf.select('subreddit','subreddit_id_new').distinct().toPandas()
|
||||
|
||||
# spark.stop()
|
||||
# df['subreddit_id_new'] = df['subreddit_id_new'] - 1
|
||||
# df = df.sort_values('subreddit_id_new').reset_index(drop=True)
|
||||
# df = df.set_index('subreddit_id_new')
|
||||
|
||||
# similarities = sim_entries.join(df, on='i')
|
||||
# similarities = similarities.rename(columns={'subreddit':"subreddit_i"})
|
||||
# similarities = similarities.join(df, on='j')
|
||||
# similarities = similarities.rename(columns={'subreddit':"subreddit_j"})
|
||||
|
||||
# similarities.to_feather(output_feather)
|
||||
# similarities.to_csv(output_csv)
|
||||
# return similarities
|
||||
|
||||
if __name__ == '__main__':
|
||||
fire.Fire(author_cosine_similarities)
|
@ -16,6 +16,8 @@ def affinity_clustering(similarities, output, damping=0.5, max_iter=100000, conv
|
||||
|
||||
preference = np.quantile(mat,preference_quantile)
|
||||
|
||||
print("data loaded")
|
||||
|
||||
clustering = AffinityPropagation(damping=damping,
|
||||
max_iter=max_iter,
|
||||
convergence_iter=convergence_iter,
|
4
datasets/job_script.sh
Executable file
4
datasets/job_script.sh
Executable file
@ -0,0 +1,4 @@
|
||||
#!/usr/bin/bash
|
||||
start_spark_cluster.sh
|
||||
spark-submit --master spark://$(hostname):18899 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/users/nathante/subreddit_term_similarity_weekly_5000.parquet --topN=5000
|
||||
stop-all.sh
|
0
ngrams/#ngrams_helper.py#
Normal file
0
ngrams/#ngrams_helper.py#
Normal file
26
ngrams/checkpoint_parallelsql.sbatch
Normal file
26
ngrams/checkpoint_parallelsql.sbatch
Normal file
@ -0,0 +1,26 @@
|
||||
#!/bin/bash
|
||||
## parallel_sql_job.sh
|
||||
#SBATCH --job-name=tf_subreddit_comments
|
||||
## Allocation Definition
|
||||
#SBATCH --account=comdata-ckpt
|
||||
#SBATCH --partition=ckpt
|
||||
## Resources
|
||||
## Nodes. This should always be 1 for parallel-sql.
|
||||
#SBATCH --nodes=1
|
||||
## Walltime (12 hours)
|
||||
#SBATCH --time=12:00:00
|
||||
## Memory per node
|
||||
#SBATCH --mem=32G
|
||||
#SBATCH --cpus-per-task=4
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH -D /gscratch/comdata/users/nathante/cdsc-reddit
|
||||
source ./bin/activate
|
||||
module load parallel_sql
|
||||
echo $(which perl)
|
||||
conda list pyarrow
|
||||
which python3
|
||||
#Put here commands to load other modules (e.g. matlab etc.)
|
||||
#Below command means that parallel_sql will get tasks from the database
|
||||
#and run them on the node (in parallel). So a 16 core node will have
|
||||
#16 tasks running at one time.
|
||||
parallel-sql --sql -a parallel --exit-on-term --jobs 4
|
@ -7,7 +7,6 @@ from itertools import groupby, islice, chain
|
||||
import fire
|
||||
from collections import Counter
|
||||
import os
|
||||
import datetime
|
||||
import re
|
||||
from nltk import wordpunct_tokenize, MWETokenizer, sent_tokenize
|
||||
from nltk.corpus import stopwords
|
||||
@ -31,8 +30,8 @@ def weekly_tf(partition, mwe_pass = 'first'):
|
||||
ngram_output = partition.replace("parquet","txt")
|
||||
|
||||
if mwe_pass == 'first':
|
||||
if os.path.exists(f"/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/{ngram_output}"):
|
||||
os.remove(f"/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/{ngram_output}")
|
||||
if os.path.exists(f"/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}"):
|
||||
os.remove(f"/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}")
|
||||
|
||||
batches = dataset.to_batches(columns=['CreatedAt','subreddit','body','author'])
|
||||
|
||||
@ -67,7 +66,7 @@ def weekly_tf(partition, mwe_pass = 'first'):
|
||||
subreddit_weeks = groupby(rows, lambda r: (r.subreddit, r.week))
|
||||
|
||||
if mwe_pass != 'first':
|
||||
mwe_dataset = pd.read_feather(f'/gscratch/comdata/users/nathante/reddit_multiword_expressions.feather')
|
||||
mwe_dataset = pd.read_feather(f'/gscratch/comdata/output/reddit_ngrams/multiword_expressions.feather')
|
||||
mwe_dataset = mwe_dataset.sort_values(['phrasePWMI'],ascending=False)
|
||||
mwe_phrases = list(mwe_dataset.phrase)
|
||||
mwe_phrases = [tuple(s.split(' ')) for s in mwe_phrases]
|
||||
@ -88,7 +87,6 @@ def weekly_tf(partition, mwe_pass = 'first'):
|
||||
new_sentence.append(new_token)
|
||||
return new_sentence
|
||||
|
||||
|
||||
stopWords = set(stopwords.words('english'))
|
||||
|
||||
# we follow the approach described in datta, phelan, adar 2017
|
||||
@ -121,7 +119,7 @@ def weekly_tf(partition, mwe_pass = 'first'):
|
||||
for sentence in sentences:
|
||||
if random() <= 0.1:
|
||||
grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
|
||||
with open(f'/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
|
||||
with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
|
||||
for ng in grams:
|
||||
gram_file.write(' '.join(ng) + '\n')
|
||||
for token in sentence:
|
||||
@ -156,7 +154,7 @@ def weekly_tf(partition, mwe_pass = 'first'):
|
||||
|
||||
outchunksize = 10000
|
||||
|
||||
with pq.ParquetWriter(f"/gscratch/comdata/users/nathante/reddit_tfidf_test.parquet_temp/{partition}",schema=schema,compression='snappy',flavor='spark') as writer, pq.ParquetWriter(f"/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp/{partition}",schema=author_schema,compression='snappy',flavor='spark') as author_writer:
|
||||
with pq.ParquetWriter(f"/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet/{partition}",schema=schema,compression='snappy',flavor='spark') as writer, pq.ParquetWriter(f"/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet/{partition}",schema=author_schema,compression='snappy',flavor='spark') as author_writer:
|
||||
|
||||
while True:
|
||||
|
21
old/#tfidf_authors.py#
Normal file
21
old/#tfidf_authors.py#
Normal file
@ -0,0 +1,21 @@
|
||||
from pyspark.sql import SparkSession
|
||||
from similarities_helper import build_tfidf_dataset
|
||||
import pandas as pd
|
||||
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
|
||||
df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet")
|
||||
|
||||
include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")
|
||||
|
||||
include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])
|
||||
|
||||
# remove [deleted] and AutoModerator (TODO remove other bots)
|
||||
df = df.filter(df.author != '[deleted]')
|
||||
df = df.filter(df.author != 'AutoModerator')
|
||||
|
||||
df = build_tfidf_dataset(df, include_subs, 'author')
|
||||
|
||||
df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf/subreddit_comment_authors.parquet',mode='overwrite',compression='snappy')
|
||||
|
||||
spark.stop()
|
27
old/#tfidf_comments_weekly.py#
Normal file
27
old/#tfidf_comments_weekly.py#
Normal file
@ -0,0 +1,27 @@
|
||||
from pyspark.sql import functions as f
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql import Window
|
||||
from similarities_helper import build_weekly_tfidf_dataset
|
||||
import pandas as pd
|
||||
|
||||
|
||||
## TODO:need to exclude automoderator / bot posts.
|
||||
## TODO:need to exclude better handle hyperlinks.
|
||||
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet")
|
||||
|
||||
include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")
|
||||
|
||||
include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])
|
||||
|
||||
# remove [deleted] and AutoModerator (TODO remove other bots)
|
||||
# df = df.filter(df.author != '[deleted]')
|
||||
# df = df.filter(df.author != 'AutoModerator')
|
||||
|
||||
df = build_weekly_tfidf_dataset(df, include_subs, 'term')
|
||||
|
||||
|
||||
df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', mode='overwrite', compression='snappy')
|
||||
spark.stop()
|
||||
|
1
old/.#tfidf_authors.py
Symbolic link
1
old/.#tfidf_authors.py
Symbolic link
@ -0,0 +1 @@
|
||||
nathante@n2347.hyak.local.31061:1602221800
|
1
old/.#tfidf_comments_weekly.py
Symbolic link
1
old/.#tfidf_comments_weekly.py
Symbolic link
@ -0,0 +1 @@
|
||||
nathante@n2347.hyak.local.31061:1602221800
|
106
old/author_cosine_similarity.py
Normal file
106
old/author_cosine_similarity.py
Normal file
@ -0,0 +1,106 @@
|
||||
from pyspark.sql import functions as f
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql import Window
|
||||
import numpy as np
|
||||
import pyarrow
|
||||
import pandas as pd
|
||||
import fire
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from similarities_helper import *
|
||||
|
||||
#tfidf = spark.read.parquet('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/subreddit_terms.parquet')
|
||||
def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, included_subreddits = None, topN = 500):
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
conf = spark.sparkContext.getConf()
|
||||
print(outfile)
|
||||
tfidf = spark.read.parquet(tfidf_path)
|
||||
|
||||
if included_subreddits is None:
|
||||
included_subreddits = select_topN_subreddits(topN)
|
||||
|
||||
else:
|
||||
included_subreddits = set(open(included_subreddits))
|
||||
|
||||
print("creating temporary parquet with matrix indicies")
|
||||
tempdir = prep_tfidf_entries_weekly(tfidf, term_colname, min_df, included_subreddits)
|
||||
|
||||
tfidf = spark.read.parquet(tempdir.name)
|
||||
|
||||
# the ids can change each week.
|
||||
subreddit_names = tfidf.select(['subreddit','subreddit_id_new','week']).distinct().toPandas()
|
||||
subreddit_names = subreddit_names.sort_values("subreddit_id_new")
|
||||
subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
|
||||
spark.stop()
|
||||
|
||||
weeks = list(subreddit_names.week.drop_duplicates())
|
||||
for week in weeks:
|
||||
print("loading matrix")
|
||||
mat = read_tfidf_matrix_weekly(tempdir.name, term_colname, week)
|
||||
print('computing similarities')
|
||||
sims = column_similarities(mat)
|
||||
del mat
|
||||
|
||||
names = subreddit_names.loc[subreddit_names.week==week]
|
||||
|
||||
sims = sims.rename({i:sr for i, sr in enumerate(names.subreddit.values)},axis=1)
|
||||
sims['subreddit'] = names.subreddit.values
|
||||
write_weekly_similarities(outfile, sims, week)
|
||||
|
||||
|
||||
|
||||
def cosine_similarities(outfile, min_df = None, included_subreddits=None, topN=500):
|
||||
'''
|
||||
Compute similarities between subreddits based on tfi-idf vectors of author comments
|
||||
|
||||
included_subreddits : string
|
||||
Text file containing a list of subreddits to include (one per line) if included_subreddits is None then do the top 500 subreddits
|
||||
|
||||
min_df : int (default = 0.1 * (number of included_subreddits)
|
||||
exclude terms that appear in fewer than this number of documents.
|
||||
|
||||
outfile: string
|
||||
where to output csv and feather outputs
|
||||
'''
|
||||
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
conf = spark.sparkContext.getConf()
|
||||
print(outfile)
|
||||
|
||||
tfidf = spark.read.parquet('/gscratch/comdata/output/reddit_similarity/tfidf/subreddit_comment_authors.parquet')
|
||||
|
||||
if included_subreddits is None:
|
||||
included_subreddits = select_topN_subreddits(topN)
|
||||
|
||||
else:
|
||||
included_subreddits = set(open(included_subreddits))
|
||||
|
||||
print("creating temporary parquet with matrix indicies")
|
||||
tempdir = prep_tfidf_entries(tfidf, 'author', min_df, included_subreddits)
|
||||
tfidf = spark.read.parquet(tempdir.name)
|
||||
subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
|
||||
subreddit_names = subreddit_names.sort_values("subreddit_id_new")
|
||||
subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
|
||||
spark.stop()
|
||||
|
||||
print("loading matrix")
|
||||
mat = read_tfidf_matrix(tempdir.name,'author')
|
||||
print('computing similarities')
|
||||
sims = column_similarities(mat)
|
||||
del mat
|
||||
|
||||
sims = pd.DataFrame(sims.todense())
|
||||
sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)},axis=1)
|
||||
sims['subreddit'] = subreddit_names.subreddit.values
|
||||
|
||||
p = Path(outfile)
|
||||
|
||||
output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
|
||||
output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
|
||||
output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))
|
||||
|
||||
sims.to_feather(outfile)
|
||||
tempdir.cleanup()
|
||||
|
||||
if __name__ == '__main__':
|
||||
fire.Fire(author_cosine_similarities)
|
61
old/term_cosine_similarity.py
Normal file
61
old/term_cosine_similarity.py
Normal file
@ -0,0 +1,61 @@
|
||||
from pyspark.sql import functions as f
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql import Window
|
||||
from pyspark.mllib.linalg.distributed import RowMatrix, CoordinateMatrix
|
||||
import numpy as np
|
||||
import pyarrow
|
||||
import pandas as pd
|
||||
import fire
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from similarities_helper import prep_tfidf_entries, read_tfidf_matrix, column_similarities, select_topN
|
||||
import scipy
|
||||
|
||||
# outfile='test_similarities_500.feather';
|
||||
# min_df = None;
|
||||
# included_subreddits=None; topN=100; exclude_phrases=True;
|
||||
def term_cosine_similarities(outfile, min_df = None, included_subreddits=None, topN=500, exclude_phrases=False):
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
conf = spark.sparkContext.getConf()
|
||||
print(outfile)
|
||||
print(exclude_phrases)
|
||||
|
||||
tfidf = spark.read.parquet('/gscratch/comdata/output/reddit_similarity/tfidf/subreddit_terms.parquet')
|
||||
|
||||
if included_subreddits is None:
|
||||
included_subreddits = select_topN_subreddits(topN)
|
||||
else:
|
||||
included_subreddits = set(open(included_subreddits))
|
||||
|
||||
if exclude_phrases == True:
|
||||
tfidf = tfidf.filter(~f.col(term).contains("_"))
|
||||
|
||||
print("creating temporary parquet with matrix indicies")
|
||||
tempdir = prep_tfidf_entries(tfidf, 'term', min_df, included_subreddits)
|
||||
tfidf = spark.read.parquet(tempdir.name)
|
||||
subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
|
||||
subreddit_names = subreddit_names.sort_values("subreddit_id_new")
|
||||
subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
|
||||
spark.stop()
|
||||
|
||||
print("loading matrix")
|
||||
mat = read_tfidf_matrix(tempdir.name,'term')
|
||||
print('computing similarities')
|
||||
sims = column_similarities(mat)
|
||||
del mat
|
||||
|
||||
sims = pd.DataFrame(sims.todense())
|
||||
sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)},axis=1)
|
||||
sims['subreddit'] = subreddit_names.subreddit.values
|
||||
|
||||
p = Path(outfile)
|
||||
|
||||
output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
|
||||
output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
|
||||
output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))
|
||||
|
||||
sims.to_feather(outfile)
|
||||
tempdir.cleanup()
|
||||
|
||||
if __name__ == '__main__':
|
||||
fire.Fire(term_cosine_similarities)
|
21
old/tfidf_authors.py
Normal file
21
old/tfidf_authors.py
Normal file
@ -0,0 +1,21 @@
|
||||
from pyspark.sql import SparkSession
|
||||
from similarities_helper import build_tfidf_dataset
|
||||
import pandas as pd
|
||||
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
|
||||
df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet")
|
||||
|
||||
include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")
|
||||
|
||||
include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])
|
||||
|
||||
# remove [deleted] and AutoModerator (TODO remove other bots)
|
||||
df = df.filter(df.author != '[deleted]')
|
||||
df = df.filter(df.author != 'AutoModerator')
|
||||
|
||||
df = build_tfidf_dataset(df, include_subs, 'author')
|
||||
|
||||
df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf/subreddit_comment_authors.parquet',mode='overwrite',compression='snappy')
|
||||
|
||||
spark.stop()
|
21
old/tfidf_authors_weekly.py
Normal file
21
old/tfidf_authors_weekly.py
Normal file
@ -0,0 +1,21 @@
|
||||
from pyspark.sql import SparkSession
|
||||
from similarities_helper import build_weekly_tfidf_dataset
|
||||
import pandas as pd
|
||||
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
|
||||
df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet")
|
||||
|
||||
include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")
|
||||
|
||||
include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])
|
||||
|
||||
# remove [deleted] and AutoModerator (TODO remove other bots)
|
||||
df = df.filter(df.author != '[deleted]')
|
||||
df = df.filter(df.author != 'AutoModerator')
|
||||
|
||||
df = build_weekly_tfidf_dataset(df, include_subs, 'author')
|
||||
|
||||
df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet', mode='overwrite', compression='snappy')
|
||||
|
||||
spark.stop()
|
18
old/tfidf_comments.py
Normal file
18
old/tfidf_comments.py
Normal file
@ -0,0 +1,18 @@
|
||||
from pyspark.sql import functions as f
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql import Window
|
||||
from similarities_helper import build_tfidf_dataset
|
||||
|
||||
## TODO:need to exclude automoderator / bot posts.
|
||||
## TODO:need to exclude better handle hyperlinks.
|
||||
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet")
|
||||
|
||||
include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")
|
||||
include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])
|
||||
|
||||
df = build_tfidf_dataset(df, include_subs, 'term')
|
||||
|
||||
df.write.parquet('/gscratch/comdata/output/reddit_similarity/reddit_similarity/subreddit_terms.parquet',mode='overwrite',compression='snappy')
|
||||
spark.stop()
|
27
old/tfidf_comments_weekly.py
Normal file
27
old/tfidf_comments_weekly.py
Normal file
@ -0,0 +1,27 @@
|
||||
from pyspark.sql import functions as f
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql import Window
|
||||
from similarities_helper import build_weekly_tfidf_dataset
|
||||
import pandas as pd
|
||||
|
||||
|
||||
## TODO:need to exclude automoderator / bot posts.
|
||||
## TODO:need to exclude better handle hyperlinks.
|
||||
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet")
|
||||
|
||||
include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")
|
||||
|
||||
include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])
|
||||
|
||||
# remove [deleted] and AutoModerator (TODO remove other bots)
|
||||
# df = df.filter(df.author != '[deleted]')
|
||||
# df = df.filter(df.author != 'AutoModerator')
|
||||
|
||||
df = build_weekly_tfidf_dataset(df, include_subs, 'term')
|
||||
|
||||
|
||||
df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', mode='overwrite', compression='snappy')
|
||||
spark.stop()
|
||||
|
73
similarities/#cosine_similarities.py#
Normal file
73
similarities/#cosine_similarities.py#
Normal file
@ -0,0 +1,73 @@
|
||||
from pyspark.sql import functions as f
|
||||
from pyspark.sql import SparkSession
|
||||
import pandas as pd
|
||||
import fire
|
||||
from pathlib import Path
|
||||
from similarities_helper import prep_tfidf_entries, read_tfidf_matrix, select_topN_subreddits
|
||||
|
||||
|
||||
def cosine_similarities(infile, term_colname, outfile, min_df=None, included_subreddits=None, topN=500, exclude_phrases=False):
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
conf = spark.sparkContext.getConf()
|
||||
print(outfile)
|
||||
print(exclude_phrases)
|
||||
|
||||
tfidf = spark.read.parquet(infile)
|
||||
|
||||
if included_subreddits is None:
|
||||
included_subreddits = select_topN_subreddits(topN)
|
||||
else:
|
||||
included_subreddits = set(open(included_subreddits))
|
||||
|
||||
if exclude_phrases == True:
|
||||
tfidf = tfidf.filter(~f.col(term_colname).contains("_"))
|
||||
|
||||
print("creating temporary parquet with matrix indicies")
|
||||
tempdir = prep_tfidf_entries(tfidf, term_colname, min_df, included_subreddits)
|
||||
tfidf = spark.read.parquet(tempdir.name)
|
||||
subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
|
||||
subreddit_names = subreddit_names.sort_values("subreddit_id_new")
|
||||
subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
|
||||
spark.stop()
|
||||
|
||||
print("loading matrix")
|
||||
mat = read_tfidf_matrix(tempdir.name, term_colname)
|
||||
print('computing similarities')
|
||||
sims = column_similarities(mat)
|
||||
del mat
|
||||
|
||||
sims = pd.DataFrame(sims.todense())
|
||||
sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1)
|
||||
sims['subreddit'] = subreddit_names.subreddit.values
|
||||
|
||||
p = Path(outfile)
|
||||
|
||||
output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
|
||||
output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
|
||||
output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))
|
||||
|
||||
sims.to_feather(outfile)
|
||||
tempdir.cleanup()
|
||||
|
||||
def term_cosine_similarities(outfile, min_df=None, included_subreddits=None, topN=500, exclude_phrases=False):
|
||||
return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet',
|
||||
'term',
|
||||
outfile,
|
||||
min_df,
|
||||
included_subreddits,
|
||||
topN,
|
||||
exclude_phrases)
|
||||
|
||||
def author_cosine_similarities(outfile, min_df=2, included_subreddits=None, topN=10000):
|
||||
return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
|
||||
'author',
|
||||
outfile,
|
||||
min_df,
|
||||
included_subreddits,
|
||||
topN,
|
||||
exclude_phrases=False)
|
||||
|
||||
if __name__ == "__main__":
|
||||
fire.Fire({'term':term_cosine_similarities,
|
||||
'author':author_cosine_similarities})
|
||||
|
24
similarities/#tfidf_weekly.py#
Normal file
24
similarities/#tfidf_weekly.py#
Normal file
@ -0,0 +1,24 @@
|
||||
from pyspark.sql import functions as f
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql import Window
|
||||
from similarities_helper import build_weekly_tfidf_dataset
|
||||
import pandas as pd
|
||||
|
||||
def tfidf_weekly(inpath, outpath, topN, term_colname, exclude):
|
||||
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet")
|
||||
|
||||
include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")
|
||||
|
||||
include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])
|
||||
|
||||
# remove [deleted] and AutoModerator (TODO remove other bots)
|
||||
# df = df.filter(df.author != '[deleted]')
|
||||
# df = df.filter(df.author != 'AutoModerator')
|
||||
|
||||
df = build_weekly_tfidf_dataset(df, include_subs, 'term')
|
||||
|
||||
|
||||
df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', mode='overwrite', compression='snappy')
|
||||
spark.stop()
|
1
similarities/.#cosine_similarities.py
Symbolic link
1
similarities/.#cosine_similarities.py
Symbolic link
@ -0,0 +1 @@
|
||||
nathante@n2347.hyak.local.31061:1602221800
|
1
similarities/.#tfidf_weekly.py
Symbolic link
1
similarities/.#tfidf_weekly.py
Symbolic link
@ -0,0 +1 @@
|
||||
nathante@n2347.hyak.local.31061:1602221800
|
2
similarities/Makefile
Normal file
2
similarities/Makefile
Normal file
@ -0,0 +1,2 @@
|
||||
/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
|
||||
start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet
|
BIN
similarities/__pycache__/similarities_helper.cpython-37.pyc
Normal file
BIN
similarities/__pycache__/similarities_helper.cpython-37.pyc
Normal file
Binary file not shown.
73
similarities/cosine_similarities.py
Normal file
73
similarities/cosine_similarities.py
Normal file
@ -0,0 +1,73 @@
|
||||
from pyspark.sql import functions as f
|
||||
from pyspark.sql import SparkSession
|
||||
import pandas as pd
|
||||
import fire
|
||||
from pathlib import Path
|
||||
from similarities_helper import prep_tfidf_entries, read_tfidf_matrix, select_topN_subreddits
|
||||
|
||||
|
||||
def cosine_similarities(infile, term_colname, outfile, min_df=None, included_subreddits=None, topN=500, exclude_phrases=False):
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
conf = spark.sparkContext.getConf()
|
||||
print(outfile)
|
||||
print(exclude_phrases)
|
||||
|
||||
tfidf = spark.read.parquet(infile)
|
||||
|
||||
if included_subreddits is None:
|
||||
included_subreddits = select_topN_subreddits(topN)
|
||||
else:
|
||||
included_subreddits = set(open(included_subreddits))
|
||||
|
||||
if exclude_phrases == True:
|
||||
tfidf = tfidf.filter(~f.col(term_colname).contains("_"))
|
||||
|
||||
print("creating temporary parquet with matrix indicies")
|
||||
tempdir = prep_tfidf_entries(tfidf, term_colname, min_df, included_subreddits)
|
||||
tfidf = spark.read.parquet(tempdir.name)
|
||||
subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
|
||||
subreddit_names = subreddit_names.sort_values("subreddit_id_new")
|
||||
subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
|
||||
spark.stop()
|
||||
|
||||
print("loading matrix")
|
||||
mat = read_tfidf_matrix(tempdir.name, term_colname)
|
||||
print('computing similarities')
|
||||
sims = column_similarities(mat)
|
||||
del mat
|
||||
|
||||
sims = pd.DataFrame(sims.todense())
|
||||
sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1)
|
||||
sims['subreddit'] = subreddit_names.subreddit.values
|
||||
|
||||
p = Path(outfile)
|
||||
|
||||
output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
|
||||
output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
|
||||
output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))
|
||||
|
||||
sims.to_feather(outfile)
|
||||
tempdir.cleanup()
|
||||
|
||||
def term_cosine_similarities(outfile, min_df=None, included_subreddits=None, topN=500, exclude_phrases=False):
    """Term-based subreddit cosine similarities; delegates to cosine_similarities."""
    tfidf_path = '/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet'
    return cosine_similarities(tfidf_path,
                               'term',
                               outfile,
                               min_df=min_df,
                               included_subreddits=included_subreddits,
                               topN=topN,
                               exclude_phrases=exclude_phrases)
|
||||
|
||||
def author_cosine_similarities(outfile, min_df=2, included_subreddits=None, topN=10000):
    """Author-based subreddit cosine similarities; delegates to cosine_similarities.

    Phrase exclusion never applies to authors, so exclude_phrases is pinned False.
    """
    tfidf_path = '/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet'
    return cosine_similarities(tfidf_path,
                               'author',
                               outfile,
                               min_df=min_df,
                               included_subreddits=included_subreddits,
                               topN=topN,
                               exclude_phrases=False)
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: `python cosine_similarities.py term|author ...` via python-fire.
    fire.Fire({'term':term_cosine_similarities,
               'author':author_cosine_similarities})
|
||||
|
4
similarities/job_script.sh
Executable file
4
similarities/job_script.sh
Executable file
@ -0,0 +1,4 @@
|
||||
#!/usr/bin/bash
# Launch a standalone Spark cluster, run the author-based cosine-similarity
# job on it, then tear the cluster down.
start_spark_cluster.sh
spark-submit --master spark://$(hostname):18899 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet
stop-all.sh
|
@ -8,11 +8,33 @@ import pyarrow.dataset as ds
|
||||
from scipy.sparse import csr_matrix
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import pathlib
|
||||
|
||||
class tf_weight(Enum):
    """Term-frequency weighting scheme selector for tf-idf construction."""
    MaxTF = 1   # tf_idf = relative_tf * idf (see build_tfidf_dataset)
    Norm05 = 2  # alternative normalization branch in build_tfidf_dataset — confirm exact formula there
|
||||
|
||||
def read_tfidf_matrix_weekly(path, term_colname, week):
    """Load one week's tf-idf entries from a parquet dataset as a sparse CSR matrix.

    Rows are terms (or authors) and columns are subreddits. Both id columns
    are stored 1-based in the parquet, so they are shifted to 0-based here.

    path : str
        Parquet dataset directory (as written by prep_tfidf_entries_weekly).
    term_colname : str
        'term' or 'author'; selects the <term>_id_new column.
    week : comparable
        Value matched against the dataset's 'week' field.
    """
    # Removed unused locals `term` and `term_id`; only the *_id_new column is read.
    term_id_new = term_colname + '_id_new'

    dataset = ds.dataset(path, format='parquet')
    entries = dataset.to_table(columns=['tf_idf', 'subreddit_id_new', term_id_new],
                               filter=ds.field('week') == week).to_pandas()
    return csr_matrix((entries.tf_idf, (entries[term_id_new] - 1, entries.subreddit_id_new - 1)))
|
||||
|
||||
def write_weekly_similarities(path, sims, week, names):
    """Write one week's similarity matrix as a long-format parquet file under *path*.

    The file is named by the week's ISO date. Note: mutates *sims* in place
    by adding a 'week' column before melting.

    path : str
        Output directory; created if missing.
    sims : pandas.DataFrame
        Wide similarity matrix with a 'subreddit' column and one column per subreddit.
    week : date-like
        Week identifier; must support .isoformat().
    names : pandas.DataFrame
        Must have a 'subreddit' column listing this week's subreddits.
    """
    sims['week'] = week
    p = pathlib.Path(path)
    # parents/exist_ok avoids the is_dir()/mkdir() race and handles nested paths
    p.mkdir(parents=True, exist_ok=True)

    # reformat as a pairwise (long) list
    sims = sims.melt(id_vars=['subreddit', 'week'], value_vars=names.subreddit.values)
    sims.to_parquet(p / week.isoformat())
|
||||
|
||||
|
||||
|
||||
def read_tfidf_matrix(path,term_colname):
|
||||
term = term_colname
|
||||
term_id = term + '_id'
|
||||
@ -29,6 +51,41 @@ def column_similarities(mat):
|
||||
return(sims)
|
||||
|
||||
|
||||
def prep_tfidf_entries_weekly(tfidf, term_colname, min_df, included_subreddits):
    """Assign per-week dense 1-based matrix ids and write tf-idf entries to a
    temporary parquet, partitioned by week.

    Returns the TemporaryDirectory holding the parquet; the caller is
    responsible for calling .cleanup() on it.
    """
    term = term_colname
    term_id = term + '_id'
    term_id_new = term + '_id_new'

    if min_df is None:
        # default: a term must appear in at least 10% of the included subreddits
        min_df = 0.1 * len(included_subreddits)

    tfidf = tfidf.filter(f.col("subreddit").isin(included_subreddits))

    # we might not have the same terms or subreddits each week, so we need to make unique ids for each week.
    sub_ids = tfidf.select(['subreddit_id','week']).distinct()
    sub_ids = sub_ids.withColumn("subreddit_id_new",f.row_number().over(Window.partitionBy('week').orderBy("subreddit_id")))
    tfidf = tfidf.join(sub_ids,['subreddit_id','week'])

    # only use terms in at least min_df included subreddits in a given week
    # NOTE(review): new_count is computed but never compared against min_df —
    # confirm whether the min_df filter was intentionally dropped here.
    new_count = tfidf.groupBy([term_id,'week']).agg(f.count(term_id).alias('new_count'))
    tfidf = tfidf.join(new_count,[term_id,'week'],how='inner')

    # reset the term ids
    term_ids = tfidf.select([term_id,'week']).distinct()
    term_ids = term_ids.withColumn(term_id_new,f.row_number().over(Window.partitionBy('week').orderBy(term_id)))
    tfidf = tfidf.join(term_ids,[term_id,'week'])

    # recompute tf_idf from relative_tf * idf; the incoming value is preserved as tf_idf_old
    tfidf = tfidf.withColumnRenamed("tf_idf","tf_idf_old")
    tfidf = tfidf.withColumn("tf_idf", (tfidf.relative_tf * tfidf.idf).cast('float'))

    tempdir =TemporaryDirectory(suffix='.parquet',prefix='term_tfidf_entries',dir='.')

    # partition by week so each week can be read back independently
    tfidf = tfidf.repartition('week')

    tfidf.write.parquet(tempdir.name,mode='overwrite',compression='snappy')
    return(tempdir)
|
||||
|
||||
|
||||
def prep_tfidf_entries(tfidf, term_colname, min_df, included_subreddits):
|
||||
term = term_colname
|
||||
term_id = term + '_id'
|
||||
@ -46,7 +103,6 @@ def prep_tfidf_entries(tfidf, term_colname, min_df, included_subreddits):
|
||||
|
||||
# only use terms in at least min_df included subreddits
|
||||
new_count = tfidf.groupBy(term_id).agg(f.count(term_id).alias('new_count'))
|
||||
# new_count = new_count.filter(f.col('new_count') >= min_df)
|
||||
tfidf = tfidf.join(new_count,term_id,how='inner')
|
||||
|
||||
# reset the term ids
|
||||
@ -55,8 +111,6 @@ def prep_tfidf_entries(tfidf, term_colname, min_df, included_subreddits):
|
||||
tfidf = tfidf.join(term_ids,term_id)
|
||||
|
||||
tfidf = tfidf.withColumnRenamed("tf_idf","tf_idf_old")
|
||||
# tfidf = tfidf.withColumnRenamed("idf","idf_old")
|
||||
# tfidf = tfidf.withColumn("idf",f.log(25000/f.col("count")))
|
||||
tfidf = tfidf.withColumn("tf_idf", (tfidf.relative_tf * tfidf.idf).cast('float'))
|
||||
|
||||
tempdir =TemporaryDirectory(suffix='.parquet',prefix='term_tfidf_entries',dir='.')
|
||||
@ -64,7 +118,9 @@ def prep_tfidf_entries(tfidf, term_colname, min_df, included_subreddits):
|
||||
tfidf.write.parquet(tempdir.name,mode='overwrite',compression='snappy')
|
||||
return tempdir
|
||||
|
||||
def cosine_similarities(tfidf, term_colname, min_df, included_subreddits, similarity_threshold):
|
||||
|
||||
# try computing cosine similarities using spark
|
||||
def spark_cosine_similarities(tfidf, term_colname, min_df, included_subreddits, similarity_threshold):
|
||||
term = term_colname
|
||||
term_id = term + '_id'
|
||||
term_id_new = term + '_id_new'
|
||||
@ -82,7 +138,6 @@ def cosine_similarities(tfidf, term_colname, min_df, included_subreddits, simila
|
||||
|
||||
# only use terms in at least min_df included subreddits
|
||||
new_count = tfidf.groupBy(term_id).agg(f.count(term_id).alias('new_count'))
|
||||
# new_count = new_count.filter(f.col('new_count') >= min_df)
|
||||
tfidf = tfidf.join(new_count,term_id,how='inner')
|
||||
|
||||
# reset the term ids
|
||||
@ -91,14 +146,10 @@ def cosine_similarities(tfidf, term_colname, min_df, included_subreddits, simila
|
||||
tfidf = tfidf.join(term_ids,term_id)
|
||||
|
||||
tfidf = tfidf.withColumnRenamed("tf_idf","tf_idf_old")
|
||||
# tfidf = tfidf.withColumnRenamed("idf","idf_old")
|
||||
# tfidf = tfidf.withColumn("idf",f.log(25000/f.col("count")))
|
||||
tfidf = tfidf.withColumn("tf_idf", tfidf.relative_tf * tfidf.idf)
|
||||
|
||||
# step 1 make an rdd of entires
|
||||
# sorted by (dense) spark subreddit id
|
||||
# entries = tfidf.filter((f.col('subreddit') == 'asoiaf') | (f.col('subreddit') == 'gameofthrones') | (f.col('subreddit') == 'christianity')).select(f.col("term_id_new")-1,f.col("subreddit_id_new")-1,"tf_idf").rdd
|
||||
|
||||
n_partitions = int(len(included_subreddits)*2 / 5)
|
||||
|
||||
entries = tfidf.select(f.col(term_id_new)-1,f.col("subreddit_id_new")-1,"tf_idf").rdd.repartition(n_partitions)
|
||||
@ -214,7 +265,6 @@ def build_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm
|
||||
df = df.join(idf, on=[term_id, term])
|
||||
|
||||
# agg terms by subreddit to make sparse tf/df vectors
|
||||
|
||||
if tf_family == tf_weight.MaxTF:
|
||||
df = df.withColumn("tf_idf", df.relative_tf * df.idf)
|
||||
else: # tf_fam = tf_weight.Norm05
|
||||
@ -222,4 +272,7 @@ def build_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv"):
    """Return the set of subreddit names whose comments_rank is within the top *topN*.

    path : str
        CSV with 'subreddit' and 'comments_rank' columns.
    """
    ranks = pd.read_csv(path)
    top = ranks[ranks.comments_rank <= topN]
    return set(top['subreddit'].values)
|
73
similarities/tfidf.py
Normal file
73
similarities/tfidf.py
Normal file
@ -0,0 +1,73 @@
|
||||
import fire
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql import functions as f
|
||||
from similarities_helper import build_tfidf_dataset, build_weekly_tfidf_dataset, select_topN_subreddits
|
||||
|
||||
|
||||
def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude):
    """Shared tf-idf driver: read the comments parquet, drop excluded
    terms/authors, restrict to the top-N subreddits, build tf-idf with
    *func*, and write the result as snappy-compressed parquet.
    """
    spark = SparkSession.builder.getOrCreate()

    df = spark.read.parquet(inpath)

    # drop rows whose term/author value is in the exclusion list (e.g. bot accounts)
    df = df.filter(~ f.col(term_colname).isin(exclude))

    include_subs = select_topN_subreddits(topN)

    # func is build_tfidf_dataset or build_weekly_tfidf_dataset
    df = func(df, include_subs, term_colname)

    df.write.parquet(outpath,mode='overwrite',compression='snappy')

    spark.stop()
|
||||
|
||||
def tfidf(inpath, outpath, topN, term_colname, exclude):
    """Build the all-time tf-idf dataset (delegates to _tfidf_wrapper)."""
    return _tfidf_wrapper(build_tfidf_dataset, inpath, outpath,
                          topN=topN, term_colname=term_colname, exclude=exclude)
|
||||
|
||||
def tfidf_weekly(inpath, outpath, topN, term_colname, exclude):
    """Build the per-week tf-idf dataset (delegates to _tfidf_wrapper)."""
    return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath,
                          topN=topN, term_colname=term_colname, exclude=exclude)
|
||||
|
||||
def tfidf_authors(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
                  topN=25000):
    """Build the all-time author tf-idf dataset for the top-N subreddits."""
    inpath = "/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet"
    excluded_authors = ['[deleted]', 'AutoModerator']  # placeholder / bot accounts
    return tfidf(inpath, outpath, topN, 'author', excluded_authors)
|
||||
|
||||
def tfidf_terms(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet',
                topN=25000):
    """Build the all-time term tf-idf dataset for the top-N subreddits."""
    inpath = "/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet"
    return tfidf(inpath, outpath, topN, 'term', [])  # no term exclusions
|
||||
|
||||
def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet',
                         topN=25000):
    """Build the per-week author tf-idf dataset for the top-N subreddits.

    Fix: the default outpath previously pointed at the non-weekly tfidf/
    directory, colliding with tfidf_authors()'s default and not matching
    the tfidf_weekly/ path that author_cosine_similarities_weekly reads.
    """
    return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
                        outpath,
                        topN,
                        'author',
                        ['[deleted]', 'AutoModerator']  # placeholder / bot accounts
                        )
|
||||
|
||||
def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
                       topN=25000):
    """Build the per-week term tf-idf dataset for the top-N subreddits.

    Fix: the default outpath previously pointed at the non-weekly tfidf/
    directory, colliding with tfidf_terms()'s default and not matching
    the tfidf_weekly/ path that term_cosine_similarities_weekly reads.
    """
    return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
                        outpath,
                        topN,
                        'term',
                        []  # no term exclusions
                        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry points: `python tfidf.py authors|terms|authors_weekly|terms_weekly`
    fire.Fire({'authors':tfidf_authors,
               'terms':tfidf_terms,
               'authors_weekly':tfidf_authors_weekly,
               'terms_weekly':tfidf_terms_weekly})
|
@ -1,14 +1,6 @@
|
||||
from pyspark.sql import functions as f
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql import Window
|
||||
from pyspark.mllib.linalg.distributed import RowMatrix, CoordinateMatrix
|
||||
import numpy as np
|
||||
import pyarrow
|
||||
import pandas as pd
|
||||
import fire
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from similarities_helper import cosine_similarities
|
||||
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
conf = spark.sparkContext.getConf()
|
||||
@ -27,4 +19,4 @@ df = df.toPandas()
|
||||
|
||||
df = df.sort_values("n_comments")
|
||||
|
||||
df.to_csv('/gscratch/comdata/users/nathante/cdsc-reddit/subreddits_by_num_comments.csv',index=False)
|
||||
df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv', index=False)
|
73
similarities/weekly_cosine_similarities.py
Normal file
73
similarities/weekly_cosine_similarities.py
Normal file
@ -0,0 +1,73 @@
|
||||
from pyspark.sql import functions as f
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql import Window
|
||||
import numpy as np
|
||||
import pyarrow
|
||||
import pandas as pd
|
||||
import fire
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from similarities_helper import *
|
||||
|
||||
|
||||
#tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_weekly.parquet')
|
||||
def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, included_subreddits = None, topN = 500):
    """Compute per-week subreddit cosine similarities from a weekly tf-idf parquet.

    Writes one parquet file per week under *outfile* via write_weekly_similarities.
    """
    spark = SparkSession.builder.getOrCreate()
    conf = spark.sparkContext.getConf()
    print(outfile)
    tfidf = spark.read.parquet(tfidf_path)

    if included_subreddits is None:
        included_subreddits = select_topN_subreddits(topN)
    else:
        # assumes one subreddit name per line
        # NOTE(review): lines keep their trailing '\n' here — confirm downstream matching strips it
        included_subreddits = set(open(included_subreddits))

    print(f"computing weekly similarities for {len(included_subreddits)} subreddits")

    print("creating temporary parquet with matrix indicies")
    tempdir = prep_tfidf_entries_weekly(tfidf, term_colname, min_df, included_subreddits)

    tfidf = spark.read.parquet(tempdir.name)

    # the ids can change each week.
    subreddit_names = tfidf.select(['subreddit','subreddit_id_new','week']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    # shift 1-based parquet ids to 0-based matrix column indices
    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
    spark.stop()

    weeks = list(subreddit_names.week.drop_duplicates())
    for week in weeks:
        print(f"loading matrix: {week}")
        mat = read_tfidf_matrix_weekly(tempdir.name, term_colname, week)
        print('computing similarities')
        sims = column_similarities(mat)
        del mat  # release the sparse matrix before densifying

        # restrict the name mapping to this week's subreddits
        names = subreddit_names.loc[subreddit_names.week == week]
        sims = pd.DataFrame(sims.todense())

        sims = sims.rename({i: sr for i, sr in enumerate(names.subreddit.values)}, axis=1)
        sims['subreddit'] = names.subreddit.values

        write_weekly_similarities(outfile, sims, week, names)
|
||||
|
||||
|
||||
def author_cosine_similarities_weekly(outfile, min_df=None, included_subreddits=None, topN=500):
    """Weekly author-based subreddit similarities; delegates to cosine_similarities_weekly."""
    tfidf_path = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet'
    return cosine_similarities_weekly(tfidf_path,
                                      outfile,
                                      'author',
                                      min_df=min_df,
                                      included_subreddits=included_subreddits,
                                      topN=topN)
|
||||
|
||||
def term_cosine_similarities_weekly(outfile, min_df=None, included_subreddits=None, topN=500):
    """Weekly term-based subreddit similarities; delegates to cosine_similarities_weekly."""
    tfidf_path = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet'
    return cosine_similarities_weekly(tfidf_path,
                                      outfile,
                                      'term',
                                      min_df=min_df,
                                      included_subreddits=included_subreddits,
                                      topN=topN)
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: `python weekly_cosine_similarities.py author|term ...`
    fire.Fire({'author':author_cosine_similarities_weekly,
               'term':term_cosine_similarities_weekly})
|
@ -1,127 +0,0 @@
|
||||
from pyspark.sql import functions as f
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql import Window
|
||||
from pyspark.mllib.linalg.distributed import RowMatrix, CoordinateMatrix
|
||||
import numpy as np
|
||||
import pyarrow
|
||||
import pandas as pd
|
||||
import fire
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from similarities_helper import cosine_similarities, prep_tfidf_entries, read_tfidf_matrix, column_similarities
|
||||
import scipy
|
||||
# outfile='test_similarities_500.feather';
|
||||
# min_df = None;
|
||||
# included_subreddits=None; topN=100; exclude_phrases=True;
|
||||
|
||||
def term_cosine_similarities(outfile, min_df = None, included_subreddits=None, topN=500, exclude_phrases=False):
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
conf = spark.sparkContext.getConf()
|
||||
print(outfile)
|
||||
print(exclude_phrases)
|
||||
|
||||
tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf.parquet')
|
||||
|
||||
if included_subreddits is None:
|
||||
rankdf = pd.read_csv("/gscratch/comdata/users/nathante/cdsc-reddit/subreddits_by_num_comments.csv")
|
||||
included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values)
|
||||
|
||||
else:
|
||||
included_subreddits = set(open(included_subreddits))
|
||||
|
||||
if exclude_phrases == True:
|
||||
tfidf = tfidf.filter(~f.col(term).contains("_"))
|
||||
|
||||
print("creating temporary parquet with matrix indicies")
|
||||
tempdir = prep_tfidf_entries(tfidf, 'term', min_df, included_subreddits)
|
||||
tfidf = spark.read.parquet(tempdir.name)
|
||||
subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
|
||||
subreddit_names = subreddit_names.sort_values("subreddit_id_new")
|
||||
subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
|
||||
spark.stop()
|
||||
|
||||
print("loading matrix")
|
||||
mat = read_tfidf_matrix(tempdir.name,'term')
|
||||
print('computing similarities')
|
||||
sims = column_similarities(mat)
|
||||
del mat
|
||||
|
||||
sims = pd.DataFrame(sims.todense())
|
||||
sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)},axis=1)
|
||||
sims['subreddit'] = subreddit_names.subreddit.values
|
||||
|
||||
p = Path(outfile)
|
||||
|
||||
output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
|
||||
output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
|
||||
output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))
|
||||
|
||||
sims.to_feather(outfile)
|
||||
tempdir.cleanup()
|
||||
path = "term_tfidf_entriesaukjy5gv.parquet"
|
||||
|
||||
|
||||
# outfile = '/gscratch/comdata/users/nathante/test_similarities_500.feather'; min_df = None; included_subreddits=None; similarity_threshold=0;
|
||||
# def term_cosine_similarities(outfile, min_df = None, included_subreddits=None, similarity_threshold=0, topN=500, exclude_phrases=True):
|
||||
# '''
|
||||
# Compute similarities between subreddits based on tfi-idf vectors of comment texts
|
||||
|
||||
# included_subreddits : string
|
||||
# Text file containing a list of subreddits to include (one per line) if included_subreddits is None then do the top 500 subreddits
|
||||
|
||||
# similarity_threshold : double (default = 0)
|
||||
# set > 0 for large numbers of subreddits to get an approximate solution using the DIMSUM algorithm
|
||||
# https://stanford.edu/~rezab/papers/dimsum.pdf. If similarity_threshold=0 we get an exact solution using an O(N^2) algorithm.
|
||||
|
||||
# min_df : int (default = 0.1 * (number of included_subreddits)
|
||||
# exclude terms that appear in fewer than this number of documents.
|
||||
|
||||
# outfile: string
|
||||
# where to output csv and feather outputs
|
||||
# '''
|
||||
|
||||
# print(outfile)
|
||||
# print(exclude_phrases)
|
||||
|
||||
# tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf.parquet')
|
||||
|
||||
# if included_subreddits is None:
|
||||
# included_subreddits = list(islice(open("/gscratch/comdata/users/nathante/cdsc-reddit/top_25000_subs_by_comments.txt"),topN))
|
||||
# included_subreddits = {s.strip('\n') for s in included_subreddits}
|
||||
|
||||
# else:
|
||||
# included_subreddits = set(open(included_subreddits))
|
||||
|
||||
# if exclude_phrases == True:
|
||||
# tfidf = tfidf.filter(~f.col(term).contains("_"))
|
||||
|
||||
# sim_dist, tfidf = cosine_similarities(tfidf, 'term', min_df, included_subreddits, similarity_threshold)
|
||||
|
||||
# p = Path(outfile)
|
||||
|
||||
# output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
|
||||
# output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
|
||||
# output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))
|
||||
|
||||
# sim_dist.entries.toDF().write.parquet(str(output_parquet),mode='overwrite',compression='snappy')
|
||||
|
||||
# #instead of toLocalMatrix() why not read as entries and put strait into numpy
|
||||
# sim_entries = pd.read_parquet(output_parquet)
|
||||
|
||||
# df = tfidf.select('subreddit','subreddit_id_new').distinct().toPandas()
|
||||
# spark.stop()
|
||||
# df['subreddit_id_new'] = df['subreddit_id_new'] - 1
|
||||
# df = df.sort_values('subreddit_id_new').reset_index(drop=True)
|
||||
# df = df.set_index('subreddit_id_new')
|
||||
|
||||
# similarities = sim_entries.join(df, on='i')
|
||||
# similarities = similarities.rename(columns={'subreddit':"subreddit_i"})
|
||||
# similarities = similarities.join(df, on='j')
|
||||
# similarities = similarities.rename(columns={'subreddit':"subreddit_j"})
|
||||
|
||||
# similarities.to_feather(output_feather)
|
||||
# similarities.to_csv(output_csv)
|
||||
# return similarities
|
||||
|
||||
if __name__ == '__main__':
|
||||
fire.Fire(term_cosine_similarities)
|
@ -1,21 +0,0 @@
|
||||
from pyspark.sql import SparkSession
|
||||
from similarities_helper import build_tfidf_dataset
|
||||
import pandas as pd
|
||||
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
|
||||
df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp")
|
||||
|
||||
include_subs = pd.read_csv("/gscratch/comdata/users/nathante/cdsc-reddit/subreddits_by_num_comments.csv")
|
||||
|
||||
#include_subs = set(include_subs.loc[include_subs.comments_rank < 300]['subreddit'])
|
||||
|
||||
# remove [deleted] and AutoModerator (TODO remove other bots)
|
||||
df = df.filter(df.author != '[deleted]')
|
||||
df = df.filter(df.author != 'AutoModerator')
|
||||
|
||||
df = build_tfidf_dataset(df, include_subs, 'author')
|
||||
|
||||
df.write.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_authors.parquet',mode='overwrite',compression='snappy')
|
||||
|
||||
spark.stop()
|
@ -1,18 +0,0 @@
|
||||
from pyspark.sql import functions as f
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql import Window
|
||||
from similarities_helper import build_tfidf_dataset
|
||||
|
||||
## TODO:need to exclude automoderator / bot posts.
|
||||
## TODO:need to exclude better handle hyperlinks.
|
||||
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test.parquet_temp")
|
||||
|
||||
include_subs = set(open("/gscratch/comdata/users/nathante/cdsc-reddit/top_25000_subs_by_comments.txt"))
|
||||
include_subs = {s.strip('\n') for s in include_subs}
|
||||
|
||||
df = build_tfidf_dataset(df, include_subs, 'term')
|
||||
|
||||
df.write.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf.parquet',mode='overwrite',compression='snappy')
|
||||
spark.stop()
|
Loading…
Reference in New Issue
Block a user