add a 'limit' parameter for testing.
This commit is contained in:
parent
4218bf864b
commit
271cbea7d9
@ -118,7 +118,8 @@ def weekly_tf(partition,
|
|||||||
temp_output_tfidf_path="/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp/",
|
temp_output_tfidf_path="/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp/",
|
||||||
output_terms_path="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
|
output_terms_path="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
|
||||||
output_authors_path="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
|
output_authors_path="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
|
||||||
reddit_dataset = 'comments'):
|
reddit_dataset = 'comments',
|
||||||
|
limit = None):
|
||||||
|
|
||||||
if reddit_dataset == 'comments':
|
if reddit_dataset == 'comments':
|
||||||
tf_func = tf_comments
|
tf_func = tf_comments
|
||||||
@ -195,10 +196,18 @@ def weekly_tf(partition,
|
|||||||
|
|
||||||
Path(output_terms_path).mkdir(parents=True, exist_ok=True)
|
Path(output_terms_path).mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
if limit is not None:
|
||||||
|
n_lines_out = 0
|
||||||
|
|
||||||
with pq.ParquetWriter(f"{output_terms_path}/{partition}",schema=schema,compression='snappy',flavor='spark') as writer, pq.ParquetWriter(f"{output_authors_path}/{partition}",schema=author_schema,compression='snappy',flavor='spark') as author_writer:
|
with pq.ParquetWriter(f"{output_terms_path}/{partition}",schema=schema,compression='snappy',flavor='spark') as writer, pq.ParquetWriter(f"{output_authors_path}/{partition}",schema=author_schema,compression='snappy',flavor='spark') as author_writer:
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
|
||||||
|
if limit is not None:
|
||||||
|
n_lines_left = limit - n_lines_out
|
||||||
|
if n_lines_left < outchunksize:
|
||||||
|
outchunksize = n_lines_left
|
||||||
|
|
||||||
chunk = islice(outrows,outchunksize)
|
chunk = islice(outrows,outchunksize)
|
||||||
chunk = (c for c in chunk if c[1] is not None)
|
chunk = (c for c in chunk if c[1] is not None)
|
||||||
pddf = pd.DataFrame(chunk, columns=["is_token"] + schema.names)
|
pddf = pd.DataFrame(chunk, columns=["is_token"] + schema.names)
|
||||||
@ -218,6 +227,12 @@ def weekly_tf(partition,
|
|||||||
author_writer.write_table(author_table)
|
author_writer.write_table(author_table)
|
||||||
do_break = False
|
do_break = False
|
||||||
|
|
||||||
|
if limit is not None:
|
||||||
|
if n_lines_out < limit:
|
||||||
|
n_lines_out += outchunksize
|
||||||
|
else:
|
||||||
|
do_break = True
|
||||||
|
|
||||||
if do_break:
|
if do_break:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user