From 2decdc97502891a194df6b385935a71c2cc11edf Mon Sep 17 00:00:00 2001
From: Nathan TeBlunthuis
Date: Wed, 27 Nov 2024 19:13:49 -0800
Subject: [PATCH] move function to outer scope.

---
 ngrams/term_frequencies.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/ngrams/term_frequencies.py b/ngrams/term_frequencies.py
index c30131c..9d43493 100755
--- a/ngrams/term_frequencies.py
+++ b/ngrams/term_frequencies.py
@@ -19,6 +19,17 @@ from pathlib import Path
 # taken from https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
 urlregex = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")
 
+def remove_punct(sentence):
+    new_sentence = []
+    for token in sentence:
+        new_token = ''
+        for c in token:
+            if c not in string.punctuation:
+                new_token += c
+        if len(new_token) > 0:
+            new_sentence.append(new_token)
+    return new_sentence
+
 def my_tokenizer(text):
     # remove stopwords, punctuation, urls, lower case
     # lowercase
@@ -169,17 +180,6 @@ def weekly_tf(partition,
     else:
         mwe_tokenize = MWETokenizer().tokenize
 
-    def remove_punct(sentence):
-        new_sentence = []
-        for token in sentence:
-            new_token = ''
-            for c in token:
-                if c not in string.punctuation:
-                    new_token += c
-            if len(new_token) > 0:
-                new_sentence.append(new_token)
-        return new_sentence
-
     stopWords = set(stopwords.words('english'))
 
     # we follow the approach described in datta, phelan, adar 2017
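
Note (not part of the patch): the commit message does not state the motivation, but a common reason to hoist a nested helper like remove_punct to module scope is that local functions cannot be pickled, which matters when the caller (here weekly_tf) is used with multiprocessing or a similar parallel framework. The sketch below is a hypothetical illustration of that difference, not code from this repository:

    # Minimal sketch: a module-level function pickles fine,
    # a function defined inside another function does not.
    import pickle
    import string

    def remove_punct(sentence):
        # Strip punctuation characters from each token;
        # drop tokens that become empty.
        new_sentence = []
        for token in sentence:
            new_token = ''.join(c for c in token if c not in string.punctuation)
            if new_token:
                new_sentence.append(new_token)
        return new_sentence

    pickle.dumps(remove_punct)  # succeeds: defined at module scope

    def outer():
        def nested(sentence):
            return sentence
        return nested

    try:
        pickle.dumps(outer())
    except Exception as e:
        # CPython raises here: local objects are not picklable.
        print("nested function is not picklable:", e)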