1
0

debugging.

This commit is contained in:
Nathan TeBlunthuis 2024-12-01 09:39:50 -08:00
parent 22d6a6961c
commit 4218bf864b

View File

@@ -33,6 +33,9 @@ def remove_punct(sentence):
def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords): def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
# remove stopwords, punctuation, urls, lower case # remove stopwords, punctuation, urls, lower case
# lowercase # lowercase
if text is None:
return ""
text = text.lower() text = text.lower()
# remove urls # remove urls
@@ -122,7 +125,8 @@ def weekly_tf(partition,
nullable_schema = False nullable_schema = False
elif reddit_dataset == 'posts': elif reddit_dataset == 'posts':
tf_func = tf_posts tf_func = tf_posts
nullable_schema = True nullable_schema = False
dataset = ds.dataset(f"{input_parquet}/{partition}", format='parquet') dataset = ds.dataset(f"{input_parquet}/{partition}", format='parquet')
if not os.path.exists(output_10p_sample_path): if not os.path.exists(output_10p_sample_path):
os.mkdir(output_10p_sample_path) os.mkdir(output_10p_sample_path)
@@ -209,6 +213,7 @@ def weekly_tf(partition,
if table.shape[0] != 0: if table.shape[0] != 0:
writer.write_table(table) writer.write_table(table)
do_break = False do_break = False
if author_table.shape[0] != 0: if author_table.shape[0] != 0:
author_writer.write_table(author_table) author_writer.write_table(author_table)
do_break = False do_break = False