Move the `remove_punct` helper function to outer (module) scope.
This commit is contained in:
parent
7da046735b
commit
2decdc9750
@ -19,6 +19,17 @@ from pathlib import Path
|
||||
# taken from https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
|
||||
urlregex = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")
|
||||
|
||||
def remove_punct(sentence):
|
||||
new_sentence = []
|
||||
for token in sentence:
|
||||
new_token = ''
|
||||
for c in token:
|
||||
if c not in string.punctuation:
|
||||
new_token += c
|
||||
if len(new_token) > 0:
|
||||
new_sentence.append(new_token)
|
||||
return new_sentence
|
||||
|
||||
def my_tokenizer(text):
|
||||
# remove stopwords, punctuation, urls, lower case
|
||||
# lowercase
|
||||
@ -169,17 +180,6 @@ def weekly_tf(partition,
|
||||
else:
|
||||
mwe_tokenize = MWETokenizer().tokenize
|
||||
|
||||
def remove_punct(sentence):
|
||||
new_sentence = []
|
||||
for token in sentence:
|
||||
new_token = ''
|
||||
for c in token:
|
||||
if c not in string.punctuation:
|
||||
new_token += c
|
||||
if len(new_token) > 0:
|
||||
new_sentence.append(new_token)
|
||||
return new_sentence
|
||||
|
||||
stopWords = set(stopwords.words('english'))
|
||||
|
||||
# we follow the approach described in datta, phelan, adar 2017
|
||||
|
Loading…
Reference in New Issue
Block a user