diff --git a/ngrams/term_frequencies.py b/ngrams/term_frequencies.py
index 21828a7..38de903 100755
--- a/ngrams/term_frequencies.py
+++ b/ngrams/term_frequencies.py
@@ -30,7 +30,7 @@ def remove_punct(sentence):
             new_sentence.append(new_token)
     return new_sentence
 
-def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
+def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords, ngram_output):
     # remove stopwords, punctuation, urls, lower case
     # lowercase
     if text is None:
@@ -58,7 +58,6 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
     # they say that the extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
     # here we take a 10 percent sample of sentences
     if mwe_pass == 'first':
-        ngram_output = partition.replace("parquet","txt")
         sentences = list(sentences)
         for sentence in sentences:
             if random() <= 0.1:
@@ -79,13 +78,13 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
             yield token
 
 
-def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
+def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output):
     for key, posts in subreddit_weeks:
         subreddit, week = key
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords)
+            tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords, ngram_output)
             tfs.update(tokens)
             authors.update([post.author])
 
@@ -95,16 +94,16 @@ def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
         for author, tf in authors.items():
             yield [False, subreddit, author, week, tf]
 
-def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
+def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output):
     for key, posts in subreddit_weeks:
         subreddit, week = key
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            title_tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords)
+            title_tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords, ngram_output)
             tfs.update(title_tokens)
             if post.selftext is not None and post.selftext != "":
-                selftext_tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords)
+                selftext_tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords, ngram_output)
                 tfs.update(selftext_tokens)
             authors.update([post.author])
 
@@ -194,7 +193,7 @@ def weekly_tf(partition,
 
     # we follow the approach described in datta, phelan, adar 2017
 
-    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords)
+    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output)
 
     outchunksize = 100000
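
The change threads `ngram_output` through as an explicit parameter instead of rebuilding it from `partition` inside `my_tokenizer`, so the first MWE pass appends its sampled sentences to a path chosen once by the caller. A minimal sketch of that parameter-threading pattern, using simplified stand-in functions and an invented example filename (not the repository's actual signatures):

```python
from random import random

def my_tokenizer(text, mwe_pass, ngram_output):
    # simplified stand-in for the real tokenizer: lowercase and split
    tokens = text.lower().split()
    if mwe_pass == 'first' and random() <= 0.1:
        # first pass: write a ~10% sample of sentences to the n-gram file,
        # which a later step mines for multi-word expressions
        with open(ngram_output, 'a') as f:
            f.write(' '.join(tokens) + '\n')
    yield from tokens

def weekly_tf(partition):
    # the caller derives the output path once and passes it down,
    # rather than each tokenizer call recomputing it from `partition`
    ngram_output = partition.replace("parquet", "txt")
    return list(my_tokenizer("An Example Post Title", 'first', ngram_output))

print(weekly_tf("comments_sample.parquet"))  # -> ['an', 'example', 'post', 'title']
```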