1
0
This commit is contained in:
Nathan TeBlunthuis 2024-12-03 19:16:49 -08:00
parent 4be8bb6bf5
commit 0436450ea8

View File

@@ -63,7 +63,7 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords, ngram_output):
if random() <= 0.1: if random() <= 0.1:
grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4)))) grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
Path(ngram_output).mkdir(parents=True, exist_ok=True) Path(ngram_output).mkdir(parents=True, exist_ok=True)
with open(ngram_output','a') as gram_file: with open(ngram_output,'a') as gram_file:
for ng in grams: for ng in grams:
gram_file.write(' '.join(ng) + '\n') gram_file.write(' '.join(ng) + '\n')
for token in sentence: for token in sentence: