pass through stopWords.

Nathan TeBlunthuis 2024-11-27 19:33:28 -08:00
parent 5d48c0eb55
commit 0d7f4d3cec

@@ -30,7 +30,7 @@ def remove_punct(sentence):
         new_sentence.append(new_token)
     return new_sentence
 
-def my_tokenizer(text, mwe_pass, mwe_tokenize):
+def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
     # remove stopwords, punctuation, urls, lower case
     # lowercase
     text = text.lower()
@@ -75,13 +75,13 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize):
         yield token
 
-def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize):
+def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
     for key, posts in subreddit_weeks:
         subreddit, week = key
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            tokens = my_tokenizer(post.body, mwe_pass, mwe_tokenize)
+            tokens = my_tokenizer(post.body, mwe_pass, mwe_tokenize, stopWords)
             tfs.update(tokens)
             authors.update([post.author])
@@ -91,13 +91,13 @@ def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize):
         for author, tf in authors.items():
             yield [False, subreddit, author, week, tf]
 
-def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize):
+def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
     for key, posts in subreddit_weeks:
         subreddit, week = key
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize)
+            tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords)
             tfs.update(tokens)
             authors.update([post.author])
@@ -184,7 +184,7 @@ def weekly_tf(partition,
     # we follow the approach described in datta, phelan, adar 2017
-    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize)
+    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords)
     outchunksize = 10000
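
The body of my_tokenizer falls outside these hunks, but the intent reads clearly from the comment at its top ("# remove stopwords ..."): the stopword set is now threaded through as an explicit argument rather than read from surrounding scope. A minimal sketch of that filtering pattern, assuming an NLTK English stopword list (filter_stopwords is a hypothetical name for illustration, not a function in this repository):

    from nltk.corpus import stopwords

    # Assumes nltk's 'stopwords' corpus is available (nltk.download('stopwords')).
    # Build the set once, then pass it through to each function explicitly,
    # so no callee depends on a module-level global being in scope.
    stopWords = set(stopwords.words('english'))

    def filter_stopwords(tokens, stopWords):
        # yield only tokens that are not in the stopword set
        for token in tokens:
            if token not in stopWords:
                yield token

    print(list(filter_stopwords(["the", "quick", "brown", "fox"], stopWords)))
    # -> ['quick', 'brown', 'fox']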