debugging.
This commit is contained in:
parent
22d6a6961c
commit
4218bf864b
@ -33,6 +33,9 @@ def remove_punct(sentence):
|
||||
def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
|
||||
# remove stopwords, punctuation, urls, lower case
|
||||
# lowercase
|
||||
if text is None:
|
||||
return ""
|
||||
|
||||
text = text.lower()
|
||||
|
||||
# remove urls
|
||||
@ -122,7 +125,8 @@ def weekly_tf(partition,
|
||||
nullable_schema = False
|
||||
elif reddit_dataset == 'posts':
|
||||
tf_func = tf_posts
|
||||
nullable_schema = True
|
||||
nullable_schema = False
|
||||
|
||||
dataset = ds.dataset(f"{input_parquet}/{partition}", format='parquet')
|
||||
if not os.path.exists(output_10p_sample_path):
|
||||
os.mkdir(output_10p_sample_path)
|
||||
@ -209,6 +213,7 @@ def weekly_tf(partition,
|
||||
if table.shape[0] != 0:
|
||||
writer.write_table(table)
|
||||
do_break = False
|
||||
|
||||
if author_table.shape[0] != 0:
|
||||
author_writer.write_table(author_table)
|
||||
do_break = False
|
||||
|
Loading…
Reference in New Issue
Block a user