diff --git a/ngrams/term_frequencies.py b/ngrams/term_frequencies.py index 9703f3c..c6d0be3 100755 --- a/ngrams/term_frequencies.py +++ b/ngrams/term_frequencies.py @@ -33,6 +33,9 @@ def remove_punct(sentence): def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords): # remove stopwords, punctuation, urls, lower case # lowercase + if text is None: + return "" + text = text.lower() # remove urls @@ -122,7 +125,8 @@ def weekly_tf(partition, nullable_schema = False elif reddit_dataset == 'posts': tf_func = tf_posts - nullable_schema = True + nullable_schema = False + dataset = ds.dataset(f"{input_parquet}/{partition}", format='parquet') if not os.path.exists(output_10p_sample_path): os.mkdir(output_10p_sample_path) @@ -209,6 +213,7 @@ def weekly_tf(partition, if table.shape[0] != 0: writer.write_table(table) do_break = False + if author_table.shape[0] != 0: author_writer.write_table(author_table) do_break = False