debugging.
This commit is contained in:
parent
22d6a6961c
commit
4218bf864b
@ -33,6 +33,9 @@ def remove_punct(sentence):
|
|||||||
def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
|
def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
|
||||||
# remove stopwords, punctuation, urls, lower case
|
# remove stopwords, punctuation, urls, lower case
|
||||||
# lowercase
|
# lowercase
|
||||||
|
if text is None:
|
||||||
|
return ""
|
||||||
|
|
||||||
text = text.lower()
|
text = text.lower()
|
||||||
|
|
||||||
# remove urls
|
# remove urls
|
||||||
@ -122,7 +125,8 @@ def weekly_tf(partition,
|
|||||||
nullable_schema = False
|
nullable_schema = False
|
||||||
elif reddit_dataset == 'posts':
|
elif reddit_dataset == 'posts':
|
||||||
tf_func = tf_posts
|
tf_func = tf_posts
|
||||||
nullable_schema = True
|
nullable_schema = False
|
||||||
|
|
||||||
dataset = ds.dataset(f"{input_parquet}/{partition}", format='parquet')
|
dataset = ds.dataset(f"{input_parquet}/{partition}", format='parquet')
|
||||||
if not os.path.exists(output_10p_sample_path):
|
if not os.path.exists(output_10p_sample_path):
|
||||||
os.mkdir(output_10p_sample_path)
|
os.mkdir(output_10p_sample_path)
|
||||||
@ -209,6 +213,7 @@ def weekly_tf(partition,
|
|||||||
if table.shape[0] != 0:
|
if table.shape[0] != 0:
|
||||||
writer.write_table(table)
|
writer.write_table(table)
|
||||||
do_break = False
|
do_break = False
|
||||||
|
|
||||||
if author_table.shape[0] != 0:
|
if author_table.shape[0] != 0:
|
||||||
author_writer.write_table(author_table)
|
author_writer.write_table(author_table)
|
||||||
do_break = False
|
do_break = False
|
||||||
|
Loading…
Reference in New Issue
Block a user