diff --git a/ngrams/term_frequencies.py b/ngrams/term_frequencies.py
index 9d43493..2723b1f 100755
--- a/ngrams/term_frequencies.py
+++ b/ngrams/term_frequencies.py
@@ -30,7 +30,7 @@ def remove_punct(sentence):
         new_sentence.append(new_token)
     return new_sentence
 
-def my_tokenizer(text):
+def my_tokenizer(text, mwe_pass):
     # remove stopwords, punctuation, urls, lower case
     # lowercase
     text = text.lower()
@@ -75,13 +75,13 @@ def my_tokenizer(text):
 
             yield token
 
-def tf_comments(subreddit_weeks):
+def tf_comments(subreddit_weeks, mwe_pass):
     for key, posts in subreddit_weeks:
         subreddit, week = key
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            tokens = my_tokenizer(post.body)
+            tokens = my_tokenizer(post.body, mwe_pass)
             tfs.update(tokens)
             authors.update([post.author])
 
@@ -91,13 +91,13 @@ def tf_comments(subreddit_weeks):
         for author, tf in authors.items():
             yield [False, subreddit, author, week, tf]
 
-def tf_posts(subreddit_weeks):
+def tf_posts(subreddit_weeks, mwe_pass):
     for key, posts in subreddit_weeks:
         subreddit, week = key
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            tokens = my_tokenizer(post.title)
+            tokens = my_tokenizer(post.title, mwe_pass)
             tfs.update(tokens)
             authors.update([post.author])
 
@@ -184,7 +184,7 @@ def weekly_tf(partition,
 
     # we follow the approach described in datta, phelan, adar 2017
 
-    outrows = tf_func(subreddit_weeks)
+    outrows = tf_func(subreddit_weeks, mwe_pass)
 
     outchunksize = 10000
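
Note: the diff threads a new mwe_pass argument through the tokenizer and both term-frequency generators, but the hunks shown do not include the code inside my_tokenizer that consumes it. Below is a minimal sketch of one plausible use, assuming mwe_pass takes values like 'first'/'second' to distinguish a phrase-collection pass from a pass that joins known multi-word expressions; the mwes parameter, the pass names, and the use of NLTK's MWETokenizer are assumptions for illustration, not taken from this diff.

    # hypothetical sketch; assumes nltk is installed and punkt data is available
    from nltk.tokenize import MWETokenizer, word_tokenize

    def my_tokenizer(text, mwe_pass, mwes=None):
        # lowercase, then split into word tokens
        text = text.lower()
        tokens = word_tokenize(text)
        if mwe_pass == 'second' and mwes:
            # on the second pass, merge known multi-word expressions
            # (an iterable of token tuples) into single underscore-joined tokens
            tokenizer = MWETokenizer(mwes, separator='_')
            tokens = tokenizer.tokenize(tokens)
        for token in tokens:
            yield token

Under these assumptions, a first pass would call my_tokenizer(text, 'first') to produce unigrams from which candidate phrases are mined, and a second pass would call my_tokenizer(text, 'second', mwes=phrases) so that detected phrases count as single terms in the tf Counters.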