git-annex in nathante@mox2.hyak.local:/gscratch/comdata/users/nathante/cdsc-reddit
This commit is contained in:
parent 4447c60265
commit 6baa08889b
@@ -13,7 +13,7 @@ spark = SparkSession.builder.getOrCreate()
 conf = spark.sparkContext.getConf()
 
 # outfile = '/gscratch/comdata/users/nathante/test_similarities_500.feather'; min_df = None; included_subreddits=None; similarity_threshold=0;
-def author_cosine_similarities(outfile, min_df = None, included_subreddits=None, similarity_threshold=0, topN=500, exclude_phrases=True):
+def author_cosine_similarities(outfile, min_df = None, included_subreddits=None, similarity_threshold=0, topN=500):
     '''
     Compute similarities between subreddits based on tfi-idf vectors of author comments
 
@@ -32,9 +32,8 @@ https://stanford.edu/~rezab/papers/dimsum.pdf. If similarity_threshold=0 we get
     '''
 
     print(outfile)
-    print(exclude_phrases)
 
-    tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_authors.parquet_test1/part-00000-107cee94-92d8-4265-b804-40f1e7f1aaf2-c000.snappy.parquet')
+    tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_authors.parquet')
 
     if included_subreddits is None:
         included_subreddits = list(islice(open("/gscratch/comdata/users/nathante/cdsc-reddit/top_25000_subs_by_comments.txt"),topN))
@@ -55,12 +54,14 @@ https://stanford.edu/~rezab/papers/dimsum.pdf. If similarity_threshold=0 we get
     sim_dist = sim_dist.repartition(1)
     sim_dist.write.parquet(str(output_parquet),mode='overwrite',compression='snappy')
 
-    spark.stop()
 
     #instead of toLocalMatrix() why not read as entries and put strait into numpy
     sim_entries = pd.read_parquet(output_parquet)
 
     df = tfidf.select('subreddit','subreddit_id_new').distinct().toPandas()
 
+    spark.stop()
     df['subreddit_id_new'] = df['subreddit_id_new'] - 1
     df = df.sort_values('subreddit_id_new').reset_index(drop=True)
     df = df.set_index('subreddit_id_new')
@@ -75,4 +76,4 @@ https://stanford.edu/~rezab/papers/dimsum.pdf. If similarity_threshold=0 we get
     return similarities
 
 if __name__ == '__main__':
-    fire.Fire(term_cosine_similarities)
+    fire.Fire(author_cosine_similarities)
@@ -13,8 +13,12 @@
 #SBATCH --mem=32G
 #SBATCH --cpus-per-task=4
 #SBATCH --ntasks=1
+#SBATCH -D /gscratch/comdata/users/nathante/cdsc-reddit
+source ./bin/activate
 module load parallel_sql
+echo $(which perl)
+conda list pyarrow
+which python3
 #Put here commands to load other modules (e.g. matlab etc.)
 #Below command means that parallel_sql will get tasks from the database
 #and run them on the node (in parallel). So a 16 core node will have
@@ -2,7 +2,7 @@
 
 #!/usr/bin/env bash
 echo "#!/usr/bin/bash" > job_script.sh
-echo "source $(pwd)/../bin/activate" >> job_script.sh
+#echo "source $(pwd)/../bin/activate" >> job_script.sh
 echo "python3 $(pwd)/comments_2_parquet_part1.py" >> job_script.sh
 
 srun -p comdata -A comdata --nodes=1 --mem=120G --time=48:00:00 --pty job_script.sh
@@ -8,8 +8,6 @@ import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
 
-globstr_base = "/gscratch/comdata/reddit_dumps/comments/RC_20*"
-
 def parse_comment(comment, names= None):
     if names is None:
         names = ["id","subreddit","link_id","parent_id","created_utc","author","ups","downs","score","edited","subreddit_type","subreddit_id","stickied","is_submitter","body","error"]
@@ -48,15 +46,15 @@ def parse_comment(comment, names= None):
 
 # conf = sc._conf.setAll([('spark.executor.memory', '20g'), ('spark.app.name', 'extract_reddit_timeline'), ('spark.executor.cores', '26'), ('spark.cores.max', '26'), ('spark.driver.memory','84g'),('spark.driver.maxResultSize','0'),('spark.local.dir','/gscratch/comdata/spark_tmp')])
 
-dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments"
+dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments/"
 
-files = list(find_dumps(dumpdir, base_pattern="RC_20*.*"))
+files = list(find_dumps(dumpdir, base_pattern="RC_20*"))
 
 pool = Pool(28)
 
 stream = open_fileset(files)
 
-N = 100000
+N = int(1e4)
 
 rows = pool.imap_unordered(parse_comment, stream, chunksize=int(N/28))
 
@@ -80,8 +78,33 @@ schema = pa.schema([
 pa.field('error', pa.string(), nullable=True),
 ])
 
-with pq.ParquetWriter("/gscratch/comdata/output/reddit_comments.parquet_temp",schema=schema,compression='snappy',flavor='spark') as writer:
-    while True:
+from pathlib import Path
+p = Path("/gscratch/comdata/output/reddit_comments.parquet_temp2")
+
+if not p.is_dir():
+    if p.exists():
+        p.unlink()
+    p.mkdir()
+
+else:
+    list(map(Path.unlink,p.glob('*')))
+
+part_size = int(1e7)
+part = 1
+n_output = 0
+writer = pq.ParquetWriter(f"/gscratch/comdata/output/reddit_comments.parquet_temp2/part_{part}.parquet",schema=schema,compression='snappy',flavor='spark')
+
+while True:
+    if n_output > part_size:
+        if part > 1:
+            writer.close()
+
+        part = part + 1
+        n_output = 0
+
+        writer = pq.ParquetWriter(f"/gscratch/comdata/output/reddit_comments.parquet_temp2/part_{part}.parquet",schema=schema,compression='snappy',flavor='spark')
+
+    n_output += N
     chunk = islice(rows,N)
     pddf = pd.DataFrame(chunk, columns=schema.names)
     table = pa.Table.from_pandas(pddf,schema=schema)
@@ -89,4 +112,4 @@ with pq.ParquetWriter("/gscratch/comdata/output/reddit_comments.parquet_temp",sc
     break
     writer.write_table(table)
 
 writer.close()
@@ -7,7 +7,7 @@ from pyspark.sql import SparkSession
 
 spark = SparkSession.builder.getOrCreate()
 
-df = spark.read.parquet("/gscratch/comdata/output/reddit_comments.parquet_temp2")
+df = spark.read.parquet("/gscratch/comdata/output/reddit_comments.parquet_temp2",compression='snappy')
 
 df = df.withColumn("subreddit_2", f.lower(f.col('subreddit')))
 df = df.drop('subreddit')
@@ -21,9 +21,9 @@ df = df.withColumn("Day",f.dayofmonth(f.col("CreatedAt")))
 df = df.repartition('subreddit')
 df2 = df.sort(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True)
 df2 = df2.sortWithinPartitions(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True)
-df2.write.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet", mode='overwrite', compression='snappy')
+df2.write.parquet("/gscratch/comdata/users/nathante/reddit_comments_by_subreddit.parquet_new", mode='overwrite', compression='snappy')
 
 df = df.repartition('author')
 df3 = df.sort(["author","CreatedAt","subreddit","link_id","parent_id","Year","Month","Day"],ascending=True)
 df3 = df3.sortWithinPartitions(["author","CreatedAt","subreddit","link_id","parent_id","Year","Month","Day"],ascending=True)
-df3.write.parquet("/gscratch/comdata/output/reddit_comments_by_author.parquet", mode='overwrite',compression='snappy')
+df3.write.parquet("/gscratch/comdata/users/nathante/reddit_comments_by_author.parquet_new", mode='overwrite',compression='snappy')
@@ -14,7 +14,7 @@ def find_dumps(dumpdir, base_pattern):
     fname, ext = path.splitext(fpath)
     dumpext[fname].append(ext)
 
-    ext_priority = ['.zst','.xz','.bz2']
+    ext_priority = ['.zst','.xz','.bz2','.gz']
 
     for base, exts in dumpext.items():
         ext = [ext for ext in ext_priority if ext in exts][0]
@@ -8,7 +8,7 @@ import pandas as pd
 import fire
 from itertools import islice
 from pathlib import Path
-from similarities_helper import build_cosine_similarities
+from similarities_helper import cosine_similarities
 
 spark = SparkSession.builder.getOrCreate()
 conf = spark.sparkContext.getConf()
@@ -57,12 +57,11 @@ https://stanford.edu/~rezab/papers/dimsum.pdf. If similarity_threshold=0 we get
 
     sim_dist.entries.toDF().write.parquet(str(output_parquet),mode='overwrite',compression='snappy')
 
-    spark.stop()
 
     #instead of toLocalMatrix() why not read as entries and put strait into numpy
     sim_entries = pd.read_parquet(output_parquet)
 
     df = tfidf.select('subreddit','subreddit_id_new').distinct().toPandas()
+    spark.stop()
     df['subreddit_id_new'] = df['subreddit_id_new'] - 1
     df = df.sort_values('subreddit_id_new').reset_index(drop=True)
     df = df.set_index('subreddit_id_new')
@@ -1,11 +1,11 @@
 #!/usr/bin/env python3
+import pandas as pd
 import pyarrow as pa
 import pyarrow.dataset as ds
 import pyarrow.parquet as pq
 from itertools import groupby, islice, chain
 import fire
 from collections import Counter
-import pandas as pd
 import os
 import datetime
 import re
@@ -22,7 +22,6 @@ urlregex = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-
 # compute term frequencies for comments in each subreddit by week
 def weekly_tf(partition, mwe_pass = 'first'):
     dataset = ds.dataset(f'/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/{partition}', format='parquet')
-
     if not os.path.exists("/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/"):
         os.mkdir("/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/")
 
@@ -31,6 +30,7 @@ def weekly_tf(partition, mwe_pass = 'first'):
 
     ngram_output = partition.replace("parquet","txt")
 
+    if mwe_pass == 'first':
         if os.path.exists(f"/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/{ngram_output}"):
             os.remove(f"/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/{ngram_output}")
 
@@ -167,13 +167,19 @@ def weekly_tf(partition, mwe_pass = 'first'):
 pddf = pddf.loc[pddf.is_token == True, schema.names]
 author_pddf = author_pddf.rename({'term':'author'}, axis='columns')
 author_pddf = author_pddf.loc[:,author_schema.names]
 
 table = pa.Table.from_pandas(pddf,schema=schema)
 author_table = pa.Table.from_pandas(author_pddf,schema=author_schema)
-if table.shape[0] == 0:
-    break
+do_break = True
+if table.shape[0] != 0:
     writer.write_table(table)
+    do_break = False
+if author_table.shape[0] != 0:
     author_writer.write_table(author_table)
+    do_break = False
+
+if do_break:
+    break
 
 writer.close()
 author_writer.close()
@@ -1,19 +1,19 @@
 from pyspark.sql import SparkSession
 from similarities_helper import build_tfidf_dataset
 
-## TODO:need to exclude automoderator / bot posts.
-## TODO:need to exclude better handle hyperlinks.
 spark = SparkSession.builder.getOrCreate()
 
-df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp/part-00000-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet")
+df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp")
 
 include_subs = set(open("/gscratch/comdata/users/nathante/cdsc-reddit/top_25000_subs_by_comments.txt"))
 include_subs = {s.strip('\n') for s in include_subs}
 
+# remove [deleted] and AutoModerator (TODO remove other bots)
 df = df.filter(df.author != '[deleted]')
 df = df.filter(df.author != 'AutoModerator')
 
 df = build_tfidf_dataset(df, include_subs, 'author')
 
-df.cache()
-
 df.write.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_authors.parquet',mode='overwrite',compression='snappy')
 
+spark.stop()
@@ -15,3 +15,4 @@ include_subs = {s.strip('\n') for s in include_subs}
 df = build_tfidf_dataset(df, include_subs, 'term')
 
 df.write.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf.parquet',mode='overwrite',compression='snappy')
+spark.stop()