1
0

Move the `remove_punct` function to outer (module) scope.

This commit is contained in:
Nathan TeBlunthuis 2024-11-27 19:13:49 -08:00
parent 7da046735b
commit 2decdc9750

View File

@ -19,6 +19,17 @@ from pathlib import Path
# taken from https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
urlregex = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")
def remove_punct(sentence):
    """Remove ASCII punctuation characters from every token in *sentence*.

    Parameters
    ----------
    sentence : iterable of str
        Tokens to clean.

    Returns
    -------
    list of str
        The tokens with all characters in ``string.punctuation`` removed;
        tokens that become empty after stripping are dropped entirely.
    """
    # One translation table, applied in a single C-level pass per token,
    # instead of rebuilding each token character by character.
    table = str.maketrans('', '', string.punctuation)
    stripped = (token.translate(table) for token in sentence)
    return [token for token in stripped if token]
def my_tokenizer(text):
# remove stopwords, punctuation, urls, lower case
# lowercase
@ -169,17 +180,6 @@ def weekly_tf(partition,
else:
mwe_tokenize = MWETokenizer().tokenize
def remove_punct(sentence):
    """Strip punctuation from each token, discarding tokens left empty."""
    kept = []
    for word in sentence:
        # Keep only the characters that are not ASCII punctuation.
        filtered = ''.join(ch for ch in word if ch not in string.punctuation)
        if len(filtered) > 0:
            kept.append(filtered)
    return kept
stopWords = set(stopwords.words('english'))
# we follow the approach described in datta, phelan, adar 2017