pass through mwe_tokenize

Nathan TeBlunthuis 2024-11-27 19:31:59 -08:00
parent 91cc1edf02
commit 5d48c0eb55

@@ -30,7 +30,7 @@ def remove_punct(sentence):
             new_sentence.append(new_token)
     return new_sentence
 
-def my_tokenizer(text, mwe_pass):
+def my_tokenizer(text, mwe_pass, mwe_tokenize):
     # remove stopwords, punctuation, urls, lower case
     # lowercase
     text = text.lower()
@@ -75,13 +75,13 @@ def my_tokenizer(text, mwe_pass):
             yield token
 
-def tf_comments(subreddit_weeks, mwe_pass):
+def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize):
     for key, posts in subreddit_weeks:
         subreddit, week = key
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            tokens = my_tokenizer(post.body, mwe_pass)
+            tokens = my_tokenizer(post.body, mwe_pass, mwe_tokenize)
             tfs.update(tokens)
             authors.update([post.author])
@@ -91,13 +91,13 @@ def tf_comments(subreddit_weeks, mwe_pass):
         for author, tf in authors.items():
             yield [False, subreddit, author, week, tf]
 
-def tf_posts(subreddit_weeks, mwe_pass):
+def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize):
     for key, posts in subreddit_weeks:
         subreddit, week = key
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            tokens = my_tokenizer(post.title, mwe_pass)
+            tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize)
             tfs.update(tokens)
             authors.update([post.author])
@@ -184,7 +184,7 @@ def weekly_tf(partition,
     # we follow the approach described in datta, phelan, adar 2017
-    outrows = tf_func(subreddit_weeks, mwe_pass)
+    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize)
     outchunksize = 10000
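
For readers following the change: the diff threads a new mwe_tokenize argument from weekly_tf down through tf_func (tf_comments or tf_posts) into my_tokenizer. A minimal sketch of what a caller might pass is below, assuming mwe_tokenize is the bound tokenize method of an nltk MWETokenizer built from phrases found during the first pass; the phrase list and variable names are illustrative, not taken from this commit.

# Sketch only: one plausible way to construct the mwe_tokenize callable
# that weekly_tf now passes through. The phrase list is hypothetical; in
# this codebase it would presumably come from the multi-word expressions
# detected when mwe_pass == 'first'.
from nltk.tokenize import MWETokenizer

phrases = [("star", "wars"), ("new", "york")]  # hypothetical MWEs
mwe_tokenize = MWETokenizer(phrases).tokenize

# MWETokenizer.tokenize merges known multi-word expressions in a token list:
# mwe_tokenize(["i", "love", "star", "wars"]) -> ["i", "love", "star_wars"]

Passing the tokenizer in as an argument, rather than rebuilding it inside each tf_* function, means it can be constructed once and shared by tf_comments and tf_posts.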