bugfix.

2024-12-03 19:02:26 -08:00 · 2024-12-03 19:02:26 -08:00 · a179d608eb
commit a179d608eb
parent 73dd2a96a6
1 changed files with 1 additions and 0 deletions
--- a/ngrams/term_frequencies.py
+++ b/ngrams/term_frequencies.py
@@ -58,6 +58,7 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
    # they say that they extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
    # here we take a 10 percent sample of sentences 
    if mwe_pass == 'first':
+        ngram_output = partition.replace("parquet","txt")
        sentences = list(sentences)
        for sentence in sentences:
            if random() <= 0.1: