Pass stopWords through to the tokenizer.
This commit is contained in:
parent
5d48c0eb55
commit
0d7f4d3cec
@ -30,7 +30,7 @@ def remove_punct(sentence):
|
||||
new_sentence.append(new_token)
|
||||
return new_sentence
|
||||
|
||||
def my_tokenizer(text, mwe_pass, mwe_tokenize):
|
||||
def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
|
||||
# remove stopwords, punctuation, and URLs; convert to lower case
|
||||
# lowercase
|
||||
text = text.lower()
|
||||
@ -75,13 +75,13 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize):
|
||||
yield token
|
||||
|
||||
|
||||
def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize):
|
||||
def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
|
||||
for key, posts in subreddit_weeks:
|
||||
subreddit, week = key
|
||||
tfs = Counter([])
|
||||
authors = Counter([])
|
||||
for post in posts:
|
||||
tokens = my_tokenizer(post.body, mwe_pass, mwe_tokenize)
|
||||
tokens = my_tokenizer(post.body, mwe_pass, mwe_tokenize, stopWords)
|
||||
tfs.update(tokens)
|
||||
authors.update([post.author])
|
||||
|
||||
@ -91,13 +91,13 @@ def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize):
|
||||
for author, tf in authors.items():
|
||||
yield [False, subreddit, author, week, tf]
|
||||
|
||||
def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize):
|
||||
def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
|
||||
for key, posts in subreddit_weeks:
|
||||
subreddit, week = key
|
||||
tfs = Counter([])
|
||||
authors = Counter([])
|
||||
for post in posts:
|
||||
tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize)
|
||||
tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords)
|
||||
tfs.update(tokens)
|
||||
authors.update([post.author])
|
||||
|
||||
@ -184,7 +184,7 @@ def weekly_tf(partition,
|
||||
|
||||
# We follow the approach described in Datta, Phelan, and Adar (2017).
|
||||
|
||||
outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize)
|
||||
outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords)
|
||||
|
||||
outchunksize = 10000
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user