diff --git a/ngrams/term_frequencies.py b/ngrams/term_frequencies.py
index c8c88de..21828a7 100755
--- a/ngrams/term_frequencies.py
+++ b/ngrams/term_frequencies.py
@@ -58,6 +58,7 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
     # they say that the extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
     # here we take a 10 percent sample of sentences
     if mwe_pass == 'first':
+        ngram_output = partition.replace("parquet","txt")
         sentences = list(sentences)
         for sentence in sentences:
            if random() <= 0.1:
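
For orientation, a hedged sketch of what the new line sets up: ngram_output is the parquet partition's file name with its extension swapped to ".txt", presumably a sidecar file for the 1-4 grams sampled during the first MWE pass. The helper name, the NLTK import, and the write logic below are illustrative assumptions; only the path derivation and the 10 percent sample appear in this hunk.

from random import random

from nltk.util import ngrams  # assumption: NLTK is available; imports are not shown in this hunk


def sample_first_pass_ngrams(sentences, partition):
    """Hypothetical helper: during the first MWE pass, sample roughly 10% of the
    (already tokenized) sentences and append their 1-4 grams to a sidecar text
    file named after the parquet partition."""
    # The line added in this diff: swap the parquet extension for .txt,
    # e.g. "comments_2019-01.parquet" -> "comments_2019-01.txt".
    ngram_output = partition.replace("parquet", "txt")

    with open(ngram_output, "a", encoding="utf-8") as out:
        for sentence in sentences:
            if random() <= 0.1:  # 10 percent sample, as in the original code
                for n in range(1, 5):  # extract 1-4 grams
                    for gram in ngrams(sentence, n):
                        out.write(" ".join(gram) + "\n")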