Refactor and reorganize.
This commit is contained in:
parent
a60747292e
commit
e6294b5b90
@@ -1,119 +0,0 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
import numpy as np
import pyarrow
import pandas as pd
import fire
from itertools import islice
from pathlib import Path
from similarities_helper import cosine_similarities, prep_tfidf_entries, read_tfidf_matrix, column_similarities

spark = SparkSession.builder.getOrCreate()
conf = spark.sparkContext.getConf()

# outfile = '/gscratch/comdata/users/nathante/test_similarities_500.feather'; min_df = None; included_subreddits=None; similarity_threshold=0;
def author_cosine_similarities(outfile, min_df = None, included_subreddits=None, similarity_threshold=0, topN=500):
    '''
    Compute similarities between subreddits based on tfi-idf vectors of author comments

    included_subreddits : string
        Text file containing a list of subreddits to include (one per line) if included_subreddits is None then do the top 500 subreddits

    similarity_threshold : double (default = 0)
        set > 0 for large numbers of subreddits to get an approximate solution using the DIMSUM algorithm
        https://stanford.edu/~rezab/papers/dimsum.pdf. If similarity_threshold=0 we get an exact solution using an O(N^2) algorithm.

    min_df : int (default = 0.1 * (number of included_subreddits)
        exclude terms that appear in fewer than this number of documents.

    outfile: string
        where to output csv and feather outputs
    '''

    spark = SparkSession.builder.getOrCreate()
    conf = spark.sparkContext.getConf()
    print(outfile)

    tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_authors.parquet')

    if included_subreddits is None:
        rankdf = pd.read_csv("/gscratch/comdata/users/nathante/cdsc-reddit/subreddits_by_num_comments.csv")
        included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values)

    else:
        included_subreddits = set(open(included_subreddits))

    print("creating temporary parquet with matrix indicies")
    tempdir = prep_tfidf_entries(tfidf, 'author', min_df, included_subreddits)
    tfidf = spark.read.parquet(tempdir.name)
    subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
    spark.stop()

    print("loading matrix")
    mat = read_tfidf_matrix(tempdir.name,'author')
    print('computing similarities')
    sims = column_similarities(mat)
    del mat

    sims = pd.DataFrame(sims.todense())
    sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)},axis=1)
    sims['subreddit'] = subreddit_names.subreddit.values

    p = Path(outfile)

    output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
    output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
    output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))

    sims.to_feather(outfile)
    tempdir.cleanup()

    # print(outfile)

    # tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_authors.parquet')

    # if included_subreddits is None:
    #     included_subreddits = list(islice(open("/gscratch/comdata/users/nathante/cdsc-reddit/top_25000_subs_by_comments.txt"),topN))
    #     included_subreddits = {s.strip('\n') for s in included_subreddits}

    # else:
    #     included_subreddits = set(open(included_subreddits))

    # sim_dist, tfidf = cosine_similarities(tfidf, 'author', min_df, included_subreddits, similarity_threshold)

    # p = Path(outfile)

    # output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
    # output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
    # output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))
    # sim_dist = sim_dist.entries.toDF()

    # sim_dist = sim_dist.repartition(1)
    # sim_dist.write.parquet(str(output_parquet),mode='overwrite',compression='snappy')



    # #instead of toLocalMatrix() why not read as entries and put strait into numpy
    # sim_entries = pd.read_parquet(output_parquet)

    # df = tfidf.select('subreddit','subreddit_id_new').distinct().toPandas()

    # spark.stop()
    # df['subreddit_id_new'] = df['subreddit_id_new'] - 1
    # df = df.sort_values('subreddit_id_new').reset_index(drop=True)
    # df = df.set_index('subreddit_id_new')

    # similarities = sim_entries.join(df, on='i')
    # similarities = similarities.rename(columns={'subreddit':"subreddit_i"})
    # similarities = similarities.join(df, on='j')
    # similarities = similarities.rename(columns={'subreddit':"subreddit_j"})

    # similarities.to_feather(output_feather)
    # similarities.to_csv(output_csv)
    # return similarities

if __name__ == '__main__':
    fire.Fire(author_cosine_similarities)
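The deleted script above relies on column_similarities from similarities_helper, whose body is not visible in this diff. As a rough illustration of the exact O(N^2) path the docstring mentions (similarity_threshold = 0), here is a minimal sketch, not the repository's implementation, of column-wise cosine similarity over a sparse term-by-subreddit matrix; the function name and normalization details are assumptions.

# Minimal sketch (assumed, not the helper's actual code) of exact column-wise
# cosine similarity for a scipy CSR matrix with one column per subreddit.
import numpy as np
from scipy.sparse import csr_matrix

def column_cosine_similarities_sketch(mat):
    # L2 norm of each column
    norms = np.asarray(np.sqrt(mat.multiply(mat).sum(axis=0))).ravel()
    norms[norms == 0] = 1.0                      # guard against empty columns
    normalized = csr_matrix(mat.multiply(1.0 / norms.reshape(1, -1)))
    # Gram matrix of unit-norm columns = pairwise cosine similarities
    return normalized.T @ normalized

# e.g. sims = pd.DataFrame(column_cosine_similarities_sketch(mat).todense())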
@@ -16,6 +16,8 @@ def affinity_clustering(similarities, output, damping=0.5, max_iter=100000, conv

     preference = np.quantile(mat,preference_quantile)

+    print("data loaded")
+
     clustering = AffinityPropagation(damping=damping,
                                      max_iter=max_iter,
                                      convergence_iter=convergence_iter,
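For context, the affinity-propagation call shown in this hunk plausibly fits together as below. This is a hedged sketch, not the file's full body: the preference and affinity='precomputed' arguments and the surrounding wrapper are assumptions based only on the visible lines.

# Hedged sketch of the clustering step; variable names follow the hunk above,
# everything else (precomputed affinity, returning labels) is assumed.
import numpy as np
from sklearn.cluster import AffinityPropagation

def affinity_cluster_sketch(mat, damping=0.5, max_iter=100000, convergence_iter=30,
                            preference_quantile=0.5):
    # preference controls how many exemplars (clusters) emerge; a quantile of
    # the similarity matrix is one common way to set it.
    preference = np.quantile(mat, preference_quantile)
    print("data loaded")
    clustering = AffinityPropagation(damping=damping,
                                     max_iter=max_iter,
                                     convergence_iter=convergence_iter,
                                     preference=preference,
                                     affinity='precomputed').fit(mat)
    return clustering.labels_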
4  datasets/job_script.sh  Executable file
@@ -0,0 +1,4 @@
#!/usr/bin/bash
start_spark_cluster.sh
spark-submit --master spark://$(hostname):18899 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/users/nathante/subreddit_term_similarity_weekly_5000.parquet --topN=5000
stop-all.sh
0  ngrams/#ngrams_helper.py#  Normal file
26  ngrams/checkpoint_parallelsql.sbatch  Normal file
@@ -0,0 +1,26 @@
#!/bin/bash
## parallel_sql_job.sh
#SBATCH --job-name=tf_subreddit_comments
## Allocation Definition
#SBATCH --account=comdata-ckpt
#SBATCH --partition=ckpt
## Resources
## Nodes. This should always be 1 for parallel-sql.
#SBATCH --nodes=1
## Walltime (12 hours)
#SBATCH --time=12:00:00
## Memory per node
#SBATCH --mem=32G
#SBATCH --cpus-per-task=4
#SBATCH --ntasks=1
#SBATCH -D /gscratch/comdata/users/nathante/cdsc-reddit
source ./bin/activate
module load parallel_sql
echo $(which perl)
conda list pyarrow
which python3
#Put here commands to load other modules (e.g. matlab etc.)
#Below command means that parallel_sql will get tasks from the database
#and run them on the node (in parallel). So a 16 core node will have
#16 tasks running at one time.
parallel-sql --sql -a parallel --exit-on-term --jobs 4
@@ -7,7 +7,6 @@ from itertools import groupby, islice, chain
 import fire
 from collections import Counter
 import os
-import datetime
 import re
 from nltk import wordpunct_tokenize, MWETokenizer, sent_tokenize
 from nltk.corpus import stopwords
@@ -31,8 +30,8 @@ def weekly_tf(partition, mwe_pass = 'first'):
     ngram_output = partition.replace("parquet","txt")

     if mwe_pass == 'first':
-        if os.path.exists(f"/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/{ngram_output}"):
-            os.remove(f"/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/{ngram_output}")
+        if os.path.exists(f"/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}"):
+            os.remove(f"/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}")

     batches = dataset.to_batches(columns=['CreatedAt','subreddit','body','author'])

@@ -67,7 +66,7 @@ def weekly_tf(partition, mwe_pass = 'first'):
     subreddit_weeks = groupby(rows, lambda r: (r.subreddit, r.week))

     if mwe_pass != 'first':
-        mwe_dataset = pd.read_feather(f'/gscratch/comdata/users/nathante/reddit_multiword_expressions.feather')
+        mwe_dataset = pd.read_feather(f'/gscratch/comdata/output/reddit_ngrams/multiword_expressions.feather')
         mwe_dataset = mwe_dataset.sort_values(['phrasePWMI'],ascending=False)
         mwe_phrases = list(mwe_dataset.phrase)
         mwe_phrases = [tuple(s.split(' ')) for s in mwe_phrases]
@@ -88,7 +87,6 @@ def weekly_tf(partition, mwe_pass = 'first'):
             new_sentence.append(new_token)
         return new_sentence

     stopWords = set(stopwords.words('english'))

     # we follow the approach described in datta, phelan, adar 2017
@@ -121,7 +119,7 @@ def weekly_tf(partition, mwe_pass = 'first'):
         for sentence in sentences:
             if random() <= 0.1:
                 grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
-                with open(f'/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
+                with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
                     for ng in grams:
                         gram_file.write(' '.join(ng) + '\n')
             for token in sentence:
@@ -156,7 +154,7 @@ def weekly_tf(partition, mwe_pass = 'first'):

     outchunksize = 10000

-    with pq.ParquetWriter(f"/gscratch/comdata/users/nathante/reddit_tfidf_test.parquet_temp/{partition}",schema=schema,compression='snappy',flavor='spark') as writer, pq.ParquetWriter(f"/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp/{partition}",schema=author_schema,compression='snappy',flavor='spark') as author_writer:
+    with pq.ParquetWriter(f"/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet/{partition}",schema=schema,compression='snappy',flavor='spark') as writer, pq.ParquetWriter(f"/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet/{partition}",schema=author_schema,compression='snappy',flavor='spark') as author_writer:

         while True:
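The hunks above reference the multiword-expression approach of Datta, Phelan, and Adar (2017) and a phrasePWMI column. As a hedged illustration only (the actual scoring lives elsewhere in the repository and is not shown here), pointwise mutual information for a candidate phrase can be sketched like this; all names below are hypothetical.

# Hypothetical PMI scoring sketch for candidate multiword expressions.
import math

def phrase_pmi(phrase, phrase_freq, token_freq, n_tokens, n_phrases):
    # PMI = log P(phrase) - sum_t log P(t): positive when the tokens
    # co-occur more often than independence would predict.
    log_p_phrase = math.log(phrase_freq[phrase] / n_phrases)
    log_p_tokens = sum(math.log(token_freq[t] / n_tokens) for t in phrase.split(' '))
    return log_p_phrase - log_p_tokens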
21  old/#tfidf_authors.py#  Normal file
@@ -0,0 +1,21 @@
from pyspark.sql import SparkSession
from similarities_helper import build_tfidf_dataset
import pandas as pd

spark = SparkSession.builder.getOrCreate()

df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet")

include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")

include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])

# remove [deleted] and AutoModerator (TODO remove other bots)
df = df.filter(df.author != '[deleted]')
df = df.filter(df.author != 'AutoModerator')

df = build_tfidf_dataset(df, include_subs, 'author')

df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf/subreddit_comment_authors.parquet',mode='overwrite',compression='snappy')

spark.stop()
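build_tfidf_dataset is defined in similarities_helper.py (its diff appears further down this page) and is only called here. As a rough, hedged sketch of the weighting it plausibly applies under the tf_weight.Norm05 branch, assuming the conventional smoothed definition suggested by the enum name and by the commented-out f.log(25000/f.col("count")) idf line; this is not the helper's actual code:

# Assumed Norm05-style tf-idf weighting; illustration only.
import math

def tf_idf_norm05(term_count, max_count_in_subreddit, n_subreddits, n_subreddits_with_term):
    relative_tf = term_count / max_count_in_subreddit
    tf = 0.5 + 0.5 * relative_tf                     # "Norm05" smoothing (assumption)
    idf = math.log(n_subreddits / n_subreddits_with_term)
    return tf * idf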
27  old/#tfidf_comments_weekly.py#  Normal file
@@ -0,0 +1,27 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
from similarities_helper import build_weekly_tfidf_dataset
import pandas as pd


## TODO:need to exclude automoderator / bot posts.
## TODO:need to exclude better handle hyperlinks.

spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet")

include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")

include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])

# remove [deleted] and AutoModerator (TODO remove other bots)
# df = df.filter(df.author != '[deleted]')
# df = df.filter(df.author != 'AutoModerator')

df = build_weekly_tfidf_dataset(df, include_subs, 'term')


df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', mode='overwrite', compression='snappy')
spark.stop()
1  old/.#tfidf_authors.py  Symbolic link
@@ -0,0 +1 @@
nathante@n2347.hyak.local.31061:1602221800
1  old/.#tfidf_comments_weekly.py  Symbolic link
@@ -0,0 +1 @@
nathante@n2347.hyak.local.31061:1602221800
106  old/author_cosine_similarity.py  Normal file
@@ -0,0 +1,106 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
import numpy as np
import pyarrow
import pandas as pd
import fire
from itertools import islice
from pathlib import Path
from similarities_helper import *

#tfidf = spark.read.parquet('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/subreddit_terms.parquet')
def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, included_subreddits = None, topN = 500):
    spark = SparkSession.builder.getOrCreate()
    conf = spark.sparkContext.getConf()
    print(outfile)
    tfidf = spark.read.parquet(tfidf_path)

    if included_subreddits is None:
        included_subreddits = select_topN_subreddits(topN)

    else:
        included_subreddits = set(open(included_subreddits))

    print("creating temporary parquet with matrix indicies")
    tempdir = prep_tfidf_entries_weekly(tfidf, term_colname, min_df, included_subreddits)

    tfidf = spark.read.parquet(tempdir.name)

    # the ids can change each week.
    subreddit_names = tfidf.select(['subreddit','subreddit_id_new','week']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
    spark.stop()

    weeks = list(subreddit_names.week.drop_duplicates())
    for week in weeks:
        print("loading matrix")
        mat = read_tfidf_matrix_weekly(tempdir.name, term_colname, week)
        print('computing similarities')
        sims = column_similarities(mat)
        del mat

        names = subreddit_names.loc[subreddit_names.week==week]

        sims = sims.rename({i:sr for i, sr in enumerate(names.subreddit.values)},axis=1)
        sims['subreddit'] = names.subreddit.values
        write_weekly_similarities(outfile, sims, week)



def cosine_similarities(outfile, min_df = None, included_subreddits=None, topN=500):
    '''
    Compute similarities between subreddits based on tfi-idf vectors of author comments

    included_subreddits : string
        Text file containing a list of subreddits to include (one per line) if included_subreddits is None then do the top 500 subreddits

    min_df : int (default = 0.1 * (number of included_subreddits)
        exclude terms that appear in fewer than this number of documents.

    outfile: string
        where to output csv and feather outputs
    '''

    spark = SparkSession.builder.getOrCreate()
    conf = spark.sparkContext.getConf()
    print(outfile)

    tfidf = spark.read.parquet('/gscratch/comdata/output/reddit_similarity/tfidf/subreddit_comment_authors.parquet')

    if included_subreddits is None:
        included_subreddits = select_topN_subreddits(topN)

    else:
        included_subreddits = set(open(included_subreddits))

    print("creating temporary parquet with matrix indicies")
    tempdir = prep_tfidf_entries(tfidf, 'author', min_df, included_subreddits)
    tfidf = spark.read.parquet(tempdir.name)
    subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
    spark.stop()

    print("loading matrix")
    mat = read_tfidf_matrix(tempdir.name,'author')
    print('computing similarities')
    sims = column_similarities(mat)
    del mat

    sims = pd.DataFrame(sims.todense())
    sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)},axis=1)
    sims['subreddit'] = subreddit_names.subreddit.values

    p = Path(outfile)

    output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
    output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
    output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))

    sims.to_feather(outfile)
    tempdir.cleanup()

if __name__ == '__main__':
    fire.Fire(author_cosine_similarities)
61  old/term_cosine_similarity.py  Normal file
@@ -0,0 +1,61 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.mllib.linalg.distributed import RowMatrix, CoordinateMatrix
import numpy as np
import pyarrow
import pandas as pd
import fire
from itertools import islice
from pathlib import Path
from similarities_helper import prep_tfidf_entries, read_tfidf_matrix, column_similarities, select_topN
import scipy

# outfile='test_similarities_500.feather';
# min_df = None;
# included_subreddits=None; topN=100; exclude_phrases=True;
def term_cosine_similarities(outfile, min_df = None, included_subreddits=None, topN=500, exclude_phrases=False):
    spark = SparkSession.builder.getOrCreate()
    conf = spark.sparkContext.getConf()
    print(outfile)
    print(exclude_phrases)

    tfidf = spark.read.parquet('/gscratch/comdata/output/reddit_similarity/tfidf/subreddit_terms.parquet')

    if included_subreddits is None:
        included_subreddits = select_topN_subreddits(topN)
    else:
        included_subreddits = set(open(included_subreddits))

    if exclude_phrases == True:
        tfidf = tfidf.filter(~f.col(term).contains("_"))

    print("creating temporary parquet with matrix indicies")
    tempdir = prep_tfidf_entries(tfidf, 'term', min_df, included_subreddits)
    tfidf = spark.read.parquet(tempdir.name)
    subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
    spark.stop()

    print("loading matrix")
    mat = read_tfidf_matrix(tempdir.name,'term')
    print('computing similarities')
    sims = column_similarities(mat)
    del mat

    sims = pd.DataFrame(sims.todense())
    sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)},axis=1)
    sims['subreddit'] = subreddit_names.subreddit.values

    p = Path(outfile)

    output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
    output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
    output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))

    sims.to_feather(outfile)
    tempdir.cleanup()

if __name__ == '__main__':
    fire.Fire(term_cosine_similarities)
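Several of the scripts above derive sibling output paths with Path(str(p).replace("".join(p.suffixes), ...)). A small, self-contained illustration of what that idiom does (the example path here is hypothetical):

# Illustration of the suffix-swapping idiom used in the scripts above.
from pathlib import Path

p = Path('/tmp/test_similarities_500.feather')                # hypothetical outfile
print(Path(str(p).replace("".join(p.suffixes), ".csv")))      # /tmp/test_similarities_500.csv
print(Path(str(p).replace("".join(p.suffixes), ".parquet")))  # /tmp/test_similarities_500.parquet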
21  old/tfidf_authors.py  Normal file
@@ -0,0 +1,21 @@
from pyspark.sql import SparkSession
from similarities_helper import build_tfidf_dataset
import pandas as pd

spark = SparkSession.builder.getOrCreate()

df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet")

include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")

include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])

# remove [deleted] and AutoModerator (TODO remove other bots)
df = df.filter(df.author != '[deleted]')
df = df.filter(df.author != 'AutoModerator')

df = build_tfidf_dataset(df, include_subs, 'author')

df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf/subreddit_comment_authors.parquet',mode='overwrite',compression='snappy')

spark.stop()
21  old/tfidf_authors_weekly.py  Normal file
@@ -0,0 +1,21 @@
from pyspark.sql import SparkSession
from similarities_helper import build_weekly_tfidf_dataset
import pandas as pd

spark = SparkSession.builder.getOrCreate()

df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet")

include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")

include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])

# remove [deleted] and AutoModerator (TODO remove other bots)
df = df.filter(df.author != '[deleted]')
df = df.filter(df.author != 'AutoModerator')

df = build_weekly_tfidf_dataset(df, include_subs, 'author')

df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet', mode='overwrite', compression='snappy')

spark.stop()
18  old/tfidf_comments.py  Normal file
@@ -0,0 +1,18 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
from similarities_helper import build_tfidf_dataset

## TODO:need to exclude automoderator / bot posts.
## TODO:need to exclude better handle hyperlinks.

spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet")

include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")
include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])

df = build_tfidf_dataset(df, include_subs, 'term')

df.write.parquet('/gscratch/comdata/output/reddit_similarity/reddit_similarity/subreddit_terms.parquet',mode='overwrite',compression='snappy')
spark.stop()
27  old/tfidf_comments_weekly.py  Normal file
@@ -0,0 +1,27 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
from similarities_helper import build_weekly_tfidf_dataset
import pandas as pd


## TODO:need to exclude automoderator / bot posts.
## TODO:need to exclude better handle hyperlinks.

spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet")

include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")

include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])

# remove [deleted] and AutoModerator (TODO remove other bots)
# df = df.filter(df.author != '[deleted]')
# df = df.filter(df.author != 'AutoModerator')

df = build_weekly_tfidf_dataset(df, include_subs, 'term')


df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', mode='overwrite', compression='snappy')
spark.stop()
73  similarities/#cosine_similarities.py#  Normal file
@@ -0,0 +1,73 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
import pandas as pd
import fire
from pathlib import Path
from similarities_helper import prep_tfidf_entries, read_tfidf_matrix, select_topN_subreddits


def cosine_similarities(infile, term_colname, outfile, min_df=None, included_subreddits=None, topN=500, exclude_phrases=False):
    spark = SparkSession.builder.getOrCreate()
    conf = spark.sparkContext.getConf()
    print(outfile)
    print(exclude_phrases)

    tfidf = spark.read.parquet(infile)

    if included_subreddits is None:
        included_subreddits = select_topN_subreddits(topN)
    else:
        included_subreddits = set(open(included_subreddits))

    if exclude_phrases == True:
        tfidf = tfidf.filter(~f.col(term_colname).contains("_"))

    print("creating temporary parquet with matrix indicies")
    tempdir = prep_tfidf_entries(tfidf, term_colname, min_df, included_subreddits)
    tfidf = spark.read.parquet(tempdir.name)
    subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
    spark.stop()

    print("loading matrix")
    mat = read_tfidf_matrix(tempdir.name, term_colname)
    print('computing similarities')
    sims = column_similarities(mat)
    del mat

    sims = pd.DataFrame(sims.todense())
    sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1)
    sims['subreddit'] = subreddit_names.subreddit.values

    p = Path(outfile)

    output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
    output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
    output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))

    sims.to_feather(outfile)
    tempdir.cleanup()

def term_cosine_similarities(outfile, min_df=None, included_subreddits=None, topN=500, exclude_phrases=False):
    return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet',
                               'term',
                               outfile,
                               min_df,
                               included_subreddits,
                               topN,
                               exclude_phrases)

def author_cosine_similarities(outfile, min_df=2, included_subreddits=None, topN=10000):
    return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
                               'author',
                               outfile,
                               min_df,
                               included_subreddits,
                               topN,
                               exclude_phrases=False)

if __name__ == "__main__":
    fire.Fire({'term':term_cosine_similarities,
               'author':author_cosine_similarities})
24  similarities/#tfidf_weekly.py#  Normal file
@@ -0,0 +1,24 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
from similarities_helper import build_weekly_tfidf_dataset
import pandas as pd

def tfidf_weekly(inpath, outpath, topN, term_colname, exclude):

    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet")

    include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")

    include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])

    # remove [deleted] and AutoModerator (TODO remove other bots)
    # df = df.filter(df.author != '[deleted]')
    # df = df.filter(df.author != 'AutoModerator')

    df = build_weekly_tfidf_dataset(df, include_subs, 'term')


    df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', mode='overwrite', compression='snappy')
    spark.stop()
1  similarities/.#cosine_similarities.py  Symbolic link
@@ -0,0 +1 @@
nathante@n2347.hyak.local.31061:1602221800
1  similarities/.#tfidf_weekly.py  Symbolic link
@@ -0,0 +1 @@
nathante@n2347.hyak.local.31061:1602221800
2  similarities/Makefile  Normal file
@@ -0,0 +1,2 @@
/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
	start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet
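For reference, the make target above drives the "author" fire command of cosine_similarities.py through the Spark job wrapper. The snippet below is an assumed, Python-level equivalent of that invocation (same output path, default min_df and topN from the file further down); it is not part of the commit.

# Python-level equivalent of the make target's fire invocation (assumed usage).
from cosine_similarities import author_cosine_similarities

author_cosine_similarities(
    outfile='/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet',
    min_df=2,          # default in cosine_similarities.py
    topN=10000)        # default in cosine_similarities.py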
BIN  similarities/__pycache__/similarities_helper.cpython-37.pyc  Normal file
Binary file not shown.
73  similarities/cosine_similarities.py  Normal file
@@ -0,0 +1,73 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
import pandas as pd
import fire
from pathlib import Path
from similarities_helper import prep_tfidf_entries, read_tfidf_matrix, select_topN_subreddits


def cosine_similarities(infile, term_colname, outfile, min_df=None, included_subreddits=None, topN=500, exclude_phrases=False):
    spark = SparkSession.builder.getOrCreate()
    conf = spark.sparkContext.getConf()
    print(outfile)
    print(exclude_phrases)

    tfidf = spark.read.parquet(infile)

    if included_subreddits is None:
        included_subreddits = select_topN_subreddits(topN)
    else:
        included_subreddits = set(open(included_subreddits))

    if exclude_phrases == True:
        tfidf = tfidf.filter(~f.col(term_colname).contains("_"))

    print("creating temporary parquet with matrix indicies")
    tempdir = prep_tfidf_entries(tfidf, term_colname, min_df, included_subreddits)
    tfidf = spark.read.parquet(tempdir.name)
    subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
    spark.stop()

    print("loading matrix")
    mat = read_tfidf_matrix(tempdir.name, term_colname)
    print('computing similarities')
    sims = column_similarities(mat)
    del mat

    sims = pd.DataFrame(sims.todense())
    sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1)
    sims['subreddit'] = subreddit_names.subreddit.values

    p = Path(outfile)

    output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
    output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
    output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))

    sims.to_feather(outfile)
    tempdir.cleanup()

def term_cosine_similarities(outfile, min_df=None, included_subreddits=None, topN=500, exclude_phrases=False):
    return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet',
                               'term',
                               outfile,
                               min_df,
                               included_subreddits,
                               topN,
                               exclude_phrases)

def author_cosine_similarities(outfile, min_df=2, included_subreddits=None, topN=10000):
    return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
                               'author',
                               outfile,
                               min_df,
                               included_subreddits,
                               topN,
                               exclude_phrases=False)

if __name__ == "__main__":
    fire.Fire({'term':term_cosine_similarities,
               'author':author_cosine_similarities})
4  similarities/job_script.sh  Executable file
@@ -0,0 +1,4 @@
#!/usr/bin/bash
start_spark_cluster.sh
spark-submit --master spark://$(hostname):18899 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet
stop-all.sh
@@ -8,11 +8,33 @@ import pyarrow.dataset as ds
 from scipy.sparse import csr_matrix
 import pandas as pd
 import numpy as np
+import pathlib

 class tf_weight(Enum):
     MaxTF = 1
     Norm05 = 2

+def read_tfidf_matrix_weekly(path, term_colname, week):
+    term = term_colname
+    term_id = term + '_id'
+    term_id_new = term + '_id_new'
+
+    dataset = ds.dataset(path,format='parquet')
+    entries = dataset.to_table(columns=['tf_idf','subreddit_id_new',term_id_new],filter=ds.field('week')==week).to_pandas()
+    return(csr_matrix((entries.tf_idf,(entries[term_id_new]-1, entries.subreddit_id_new-1))))
+
+def write_weekly_similarities(path, sims, week, names):
+    sims['week'] = week
+    p = pathlib.Path(path)
+    if not p.is_dir():
+        p.mkdir()
+
+    # reformat as a pairwise list
+    sims = sims.melt(id_vars=['subreddit','week'],value_vars=names.subreddit.values)
+    sims.to_parquet(p / week.isoformat())
+
+
 def read_tfidf_matrix(path,term_colname):
     term = term_colname
     term_id = term + '_id'
@@ -29,6 +51,41 @@ def column_similarities(mat):
     return(sims)


+def prep_tfidf_entries_weekly(tfidf, term_colname, min_df, included_subreddits):
+    term = term_colname
+    term_id = term + '_id'
+    term_id_new = term + '_id_new'
+
+    if min_df is None:
+        min_df = 0.1 * len(included_subreddits)
+
+    tfidf = tfidf.filter(f.col("subreddit").isin(included_subreddits))
+
+    # we might not have the same terms or subreddits each week, so we need to make unique ids for each week.
+    sub_ids = tfidf.select(['subreddit_id','week']).distinct()
+    sub_ids = sub_ids.withColumn("subreddit_id_new",f.row_number().over(Window.partitionBy('week').orderBy("subreddit_id")))
+    tfidf = tfidf.join(sub_ids,['subreddit_id','week'])
+
+    # only use terms in at least min_df included subreddits in a given week
+    new_count = tfidf.groupBy([term_id,'week']).agg(f.count(term_id).alias('new_count'))
+    tfidf = tfidf.join(new_count,[term_id,'week'],how='inner')
+
+    # reset the term ids
+    term_ids = tfidf.select([term_id,'week']).distinct()
+    term_ids = term_ids.withColumn(term_id_new,f.row_number().over(Window.partitionBy('week').orderBy(term_id)))
+    tfidf = tfidf.join(term_ids,[term_id,'week'])
+
+    tfidf = tfidf.withColumnRenamed("tf_idf","tf_idf_old")
+    tfidf = tfidf.withColumn("tf_idf", (tfidf.relative_tf * tfidf.idf).cast('float'))
+
+    tempdir =TemporaryDirectory(suffix='.parquet',prefix='term_tfidf_entries',dir='.')
+
+    tfidf = tfidf.repartition('week')
+
+    tfidf.write.parquet(tempdir.name,mode='overwrite',compression='snappy')
+    return(tempdir)
+
+
 def prep_tfidf_entries(tfidf, term_colname, min_df, included_subreddits):
     term = term_colname
     term_id = term + '_id'
@@ -46,7 +103,6 @@ def prep_tfidf_entries(tfidf, term_colname, min_df, included_subreddits):

     # only use terms in at least min_df included subreddits
     new_count = tfidf.groupBy(term_id).agg(f.count(term_id).alias('new_count'))
-    # new_count = new_count.filter(f.col('new_count') >= min_df)
     tfidf = tfidf.join(new_count,term_id,how='inner')

     # reset the term ids
@@ -55,8 +111,6 @@ def prep_tfidf_entries(tfidf, term_colname, min_df, included_subreddits):
     tfidf = tfidf.join(term_ids,term_id)

     tfidf = tfidf.withColumnRenamed("tf_idf","tf_idf_old")
-    # tfidf = tfidf.withColumnRenamed("idf","idf_old")
-    # tfidf = tfidf.withColumn("idf",f.log(25000/f.col("count")))
     tfidf = tfidf.withColumn("tf_idf", (tfidf.relative_tf * tfidf.idf).cast('float'))

     tempdir =TemporaryDirectory(suffix='.parquet',prefix='term_tfidf_entries',dir='.')
@@ -64,7 +118,9 @@ def prep_tfidf_entries(tfidf, term_colname, min_df, included_subreddits):
     tfidf.write.parquet(tempdir.name,mode='overwrite',compression='snappy')
     return tempdir

-def cosine_similarities(tfidf, term_colname, min_df, included_subreddits, similarity_threshold):
+
+# try computing cosine similarities using spark
+def spark_cosine_similarities(tfidf, term_colname, min_df, included_subreddits, similarity_threshold):
     term = term_colname
     term_id = term + '_id'
     term_id_new = term + '_id_new'
@@ -82,7 +138,6 @@ def cosine_similarities(tfidf, term_colname, min_df, included_subreddits, simila

     # only use terms in at least min_df included subreddits
     new_count = tfidf.groupBy(term_id).agg(f.count(term_id).alias('new_count'))
-    # new_count = new_count.filter(f.col('new_count') >= min_df)
     tfidf = tfidf.join(new_count,term_id,how='inner')

     # reset the term ids
@@ -91,14 +146,10 @@ def cosine_similarities(tfidf, term_colname, min_df, included_subreddits, simila
     tfidf = tfidf.join(term_ids,term_id)

     tfidf = tfidf.withColumnRenamed("tf_idf","tf_idf_old")
-    # tfidf = tfidf.withColumnRenamed("idf","idf_old")
-    # tfidf = tfidf.withColumn("idf",f.log(25000/f.col("count")))
     tfidf = tfidf.withColumn("tf_idf", tfidf.relative_tf * tfidf.idf)

     # step 1 make an rdd of entires
     # sorted by (dense) spark subreddit id
-    # entries = tfidf.filter((f.col('subreddit') == 'asoiaf') | (f.col('subreddit') == 'gameofthrones') | (f.col('subreddit') == 'christianity')).select(f.col("term_id_new")-1,f.col("subreddit_id_new")-1,"tf_idf").rdd

     n_partitions = int(len(included_subreddits)*2 / 5)

     entries = tfidf.select(f.col(term_id_new)-1,f.col("subreddit_id_new")-1,"tf_idf").rdd.repartition(n_partitions)
@@ -214,7 +265,6 @@ def build_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm
     df = df.join(idf, on=[term_id, term])

     # agg terms by subreddit to make sparse tf/df vectors
     if tf_family == tf_weight.MaxTF:
         df = df.withColumn("tf_idf", df.relative_tf * df.idf)
     else: # tf_fam = tf_weight.Norm05
@@ -222,4 +272,7 @@ def build_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm

     return df

+def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv"):
+    rankdf = pd.read_csv(path)
+    included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values)
+    return included_subreddits
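write_weekly_similarities above stores one long-format parquet file per week (named by week.isoformat()) under the output directory. The helper below is a hedged sketch of reading those files back into a single pandas frame; it is illustrative, not part of the repository.

# Illustrative reader for the per-week parquet files written above.
from pathlib import Path
import pandas as pd

def read_weekly_similarities(path):
    parts = [pd.read_parquet(f) for f in sorted(Path(path).iterdir())]
    # columns produced by the melt() above: subreddit, week, variable, value
    return pd.concat(parts, ignore_index=True)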
73  similarities/tfidf.py  Normal file
@@ -0,0 +1,73 @@
import fire
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from similarities_helper import build_tfidf_dataset, build_weekly_tfidf_dataset, select_topN_subreddits


def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude):
    spark = SparkSession.builder.getOrCreate()

    df = spark.read.parquet(inpath)

    df = df.filter(~ f.col(term_colname).isin(exclude))

    include_subs = select_topN_subreddits(topN)

    df = func(df, include_subs, term_colname)

    df.write.parquet(outpath,mode='overwrite',compression='snappy')

    spark.stop()

def tfidf(inpath, outpath, topN, term_colname, exclude):
    return _tfidf_wrapper(build_tfidf_dataset, inpath, outpath, topN, term_colname, exclude)

def tfidf_weekly(inpath, outpath, topN, term_colname, exclude):
    return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath, topN, term_colname, exclude)

def tfidf_authors(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
                  topN=25000):

    return tfidf("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
                 outpath,
                 topN,
                 'author',
                 ['[deleted]','AutoModerator']
                 )

def tfidf_terms(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet',
                topN=25000):

    return tfidf("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
                 outpath,
                 topN,
                 'term',
                 []
                 )

def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
                         topN=25000):

    return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
                        outpath,
                        topN,
                        'author',
                        ['[deleted]','AutoModerator']
                        )

def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet',
                       topN=25000):

    return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
                        outpath,
                        topN,
                        'term',
                        []
                        )


if __name__ == "__main__":
    fire.Fire({'authors':tfidf_authors,
               'terms':tfidf_terms,
               'authors_weekly':tfidf_authors_weekly,
               'terms_weekly':tfidf_terms_weekly})
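The fire dispatch above exposes four subcommands (authors, terms, authors_weekly, terms_weekly). The same entry points can also be called directly from Python; the snippet below is an assumed usage example, not part of the commit.

# Assumed usage of the tfidf.py entry points defined above.
from tfidf import tfidf_authors, tfidf_terms

tfidf_authors(topN=25000)   # -> .../reddit_similarity/tfidf/comment_authors.parquet
tfidf_terms(topN=25000)     # -> .../reddit_similarity/tfidf/comment_terms.parquet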
@@ -1,14 +1,6 @@
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
-from pyspark.mllib.linalg.distributed import RowMatrix, CoordinateMatrix
-import numpy as np
-import pyarrow
-import pandas as pd
-import fire
-from itertools import islice
-from pathlib import Path
-from similarities_helper import cosine_similarities

 spark = SparkSession.builder.getOrCreate()
 conf = spark.sparkContext.getConf()
@@ -21,10 +13,10 @@ df = df.filter(~df.subreddit.like("u_%"))
 df = df.groupBy('subreddit').agg(f.count('id').alias("n_comments"))

 win = Window.orderBy(f.col('n_comments').desc())
-df = df.withColumn('comments_rank',f.rank().over(win))
+df = df.withColumn('comments_rank', f.rank().over(win))

 df = df.toPandas()

 df = df.sort_values("n_comments")

-df.to_csv('/gscratch/comdata/users/nathante/cdsc-reddit/subreddits_by_num_comments.csv',index=False)
+df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv', index=False)
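The CSV written above is what select_topN_subreddits in similarities_helper.py (added earlier in this commit) reads to decide which subreddits to include. A minimal usage sketch:

# How the ranking CSV above is consumed downstream (usage sketch).
from similarities_helper import select_topN_subreddits

top500 = select_topN_subreddits(500)   # set of the 500 most-commented subreddits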
73  similarities/weekly_cosine_similarities.py  Normal file
@@ -0,0 +1,73 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
import numpy as np
import pyarrow
import pandas as pd
import fire
from itertools import islice
from pathlib import Path
from similarities_helper import *


#tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_weekly.parquet')
def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, included_subreddits = None, topN = 500):
    spark = SparkSession.builder.getOrCreate()
    conf = spark.sparkContext.getConf()
    print(outfile)
    tfidf = spark.read.parquet(tfidf_path)

    if included_subreddits is None:
        included_subreddits = select_topN_subreddits(topN)
    else:
        included_subreddits = set(open(included_subreddits))

    print(f"computing weekly similarities for {len(included_subreddits)} subreddits")

    print("creating temporary parquet with matrix indicies")
    tempdir = prep_tfidf_entries_weekly(tfidf, term_colname, min_df, included_subreddits)

    tfidf = spark.read.parquet(tempdir.name)

    # the ids can change each week.
    subreddit_names = tfidf.select(['subreddit','subreddit_id_new','week']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
    spark.stop()

    weeks = list(subreddit_names.week.drop_duplicates())
    for week in weeks:
        print(f"loading matrix: {week}")
        mat = read_tfidf_matrix_weekly(tempdir.name, term_colname, week)
        print('computing similarities')
        sims = column_similarities(mat)
        del mat

        names = subreddit_names.loc[subreddit_names.week == week]
        sims = pd.DataFrame(sims.todense())

        sims = sims.rename({i: sr for i, sr in enumerate(names.subreddit.values)}, axis=1)
        sims['subreddit'] = names.subreddit.values

        write_weekly_similarities(outfile, sims, week, names)


def author_cosine_similarities_weekly(outfile, min_df=None , included_subreddits=None, topN=500):
    return cosine_similarities_weekly('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet',
                                      outfile,
                                      'author',
                                      min_df,
                                      included_subreddits,
                                      topN)

def term_cosine_similarities_weekly(outfile, min_df=None, included_subreddits=None, topN=500):
    return cosine_similarities_weekly('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
                                      outfile,
                                      'term',
                                      min_df,
                                      included_subreddits,
                                      topN)

if __name__ == "__main__":
    fire.Fire({'author':author_cosine_similarities_weekly,
               'term':term_cosine_similarities_weekly})
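datasets/job_script.sh earlier in this commit drives this module with "weekly_cosine_similarities.py term --outfile=... --topN=5000". The Python-level equivalent of that fire invocation (assumed usage, same arguments):

# Python equivalent of the spark-submit fire invocation in datasets/job_script.sh.
from weekly_cosine_similarities import term_cosine_similarities_weekly

term_cosine_similarities_weekly(
    outfile='/gscratch/comdata/users/nathante/subreddit_term_similarity_weekly_5000.parquet',
    topN=5000)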
@@ -1,127 +0,0 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.mllib.linalg.distributed import RowMatrix, CoordinateMatrix
import numpy as np
import pyarrow
import pandas as pd
import fire
from itertools import islice
from pathlib import Path
from similarities_helper import cosine_similarities, prep_tfidf_entries, read_tfidf_matrix, column_similarities
import scipy
# outfile='test_similarities_500.feather';
# min_df = None;
# included_subreddits=None; topN=100; exclude_phrases=True;

def term_cosine_similarities(outfile, min_df = None, included_subreddits=None, topN=500, exclude_phrases=False):
    spark = SparkSession.builder.getOrCreate()
    conf = spark.sparkContext.getConf()
    print(outfile)
    print(exclude_phrases)

    tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf.parquet')

    if included_subreddits is None:
        rankdf = pd.read_csv("/gscratch/comdata/users/nathante/cdsc-reddit/subreddits_by_num_comments.csv")
        included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values)

    else:
        included_subreddits = set(open(included_subreddits))

    if exclude_phrases == True:
        tfidf = tfidf.filter(~f.col(term).contains("_"))

    print("creating temporary parquet with matrix indicies")
    tempdir = prep_tfidf_entries(tfidf, 'term', min_df, included_subreddits)
    tfidf = spark.read.parquet(tempdir.name)
    subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
    spark.stop()

    print("loading matrix")
    mat = read_tfidf_matrix(tempdir.name,'term')
    print('computing similarities')
    sims = column_similarities(mat)
    del mat

    sims = pd.DataFrame(sims.todense())
    sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)},axis=1)
    sims['subreddit'] = subreddit_names.subreddit.values

    p = Path(outfile)

    output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
    output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
    output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))

    sims.to_feather(outfile)
    tempdir.cleanup()
path = "term_tfidf_entriesaukjy5gv.parquet"


# outfile = '/gscratch/comdata/users/nathante/test_similarities_500.feather'; min_df = None; included_subreddits=None; similarity_threshold=0;
# def term_cosine_similarities(outfile, min_df = None, included_subreddits=None, similarity_threshold=0, topN=500, exclude_phrases=True):
# '''
# Compute similarities between subreddits based on tfi-idf vectors of comment texts

# included_subreddits : string
# Text file containing a list of subreddits to include (one per line) if included_subreddits is None then do the top 500 subreddits

# similarity_threshold : double (default = 0)
# set > 0 for large numbers of subreddits to get an approximate solution using the DIMSUM algorithm
# https://stanford.edu/~rezab/papers/dimsum.pdf. If similarity_threshold=0 we get an exact solution using an O(N^2) algorithm.

# min_df : int (default = 0.1 * (number of included_subreddits)
# exclude terms that appear in fewer than this number of documents.

# outfile: string
# where to output csv and feather outputs
# '''

# print(outfile)
# print(exclude_phrases)

# tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf.parquet')

# if included_subreddits is None:
# included_subreddits = list(islice(open("/gscratch/comdata/users/nathante/cdsc-reddit/top_25000_subs_by_comments.txt"),topN))
# included_subreddits = {s.strip('\n') for s in included_subreddits}

# else:
# included_subreddits = set(open(included_subreddits))

# if exclude_phrases == True:
# tfidf = tfidf.filter(~f.col(term).contains("_"))

# sim_dist, tfidf = cosine_similarities(tfidf, 'term', min_df, included_subreddits, similarity_threshold)

# p = Path(outfile)

# output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
# output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
# output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))

# sim_dist.entries.toDF().write.parquet(str(output_parquet),mode='overwrite',compression='snappy')

# #instead of toLocalMatrix() why not read as entries and put strait into numpy
# sim_entries = pd.read_parquet(output_parquet)

# df = tfidf.select('subreddit','subreddit_id_new').distinct().toPandas()

# spark.stop()
# df['subreddit_id_new'] = df['subreddit_id_new'] - 1
# df = df.sort_values('subreddit_id_new').reset_index(drop=True)
# df = df.set_index('subreddit_id_new')

# similarities = sim_entries.join(df, on='i')
# similarities = similarities.rename(columns={'subreddit':"subreddit_i"})
# similarities = similarities.join(df, on='j')
# similarities = similarities.rename(columns={'subreddit':"subreddit_j"})

# similarities.to_feather(output_feather)
# similarities.to_csv(output_csv)
# return similarities

if __name__ == '__main__':
    fire.Fire(term_cosine_similarities)
@@ -1,21 +0,0 @@
from pyspark.sql import SparkSession
from similarities_helper import build_tfidf_dataset
import pandas as pd

spark = SparkSession.builder.getOrCreate()

df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp")

include_subs = pd.read_csv("/gscratch/comdata/users/nathante/cdsc-reddit/subreddits_by_num_comments.csv")

#include_subs = set(include_subs.loc[include_subs.comments_rank < 300]['subreddit'])

# remove [deleted] and AutoModerator (TODO remove other bots)
df = df.filter(df.author != '[deleted]')
df = df.filter(df.author != 'AutoModerator')

df = build_tfidf_dataset(df, include_subs, 'author')

df.write.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_authors.parquet',mode='overwrite',compression='snappy')

spark.stop()
@@ -1,18 +0,0 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
from similarities_helper import build_tfidf_dataset

## TODO:need to exclude automoderator / bot posts.
## TODO:need to exclude better handle hyperlinks.

spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test.parquet_temp")

include_subs = set(open("/gscratch/comdata/users/nathante/cdsc-reddit/top_25000_subs_by_comments.txt"))
include_subs = {s.strip('\n') for s in include_subs}

df = build_tfidf_dataset(df, include_subs, 'term')

df.write.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf.parquet',mode='overwrite',compression='snappy')
spark.stop()