pass ngram_output through.
This commit is contained in:
parent a179d608eb
commit ec5859c311
@@ -30,7 +30,7 @@ def remove_punct(sentence):
         new_sentence.append(new_token)
     return new_sentence
 
-def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
+def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords, ngram_output):
     # remove stopwords, punctuation, urls, lower case
     # lowercase
     if text is None:
@@ -58,7 +58,6 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
     # they say that the extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
     # here we take a 10 percent sample of sentences
     if mwe_pass == 'first':
-        ngram_output = partition.replace("parquet","txt")
         sentences = list(sentences)
         for sentence in sentences:
             if random() <= 0.1:
@@ -79,13 +78,13 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
                 yield token
 
 
-def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
+def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output):
     for key, posts in subreddit_weeks:
         subreddit, week = key
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords)
+            tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords, ngram_output)
             tfs.update(tokens)
             authors.update([post.author])
 
@@ -95,16 +94,16 @@ def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
     for author, tf in authors.items():
         yield [False, subreddit, author, week, tf]
 
-def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
+def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output):
     for key, posts in subreddit_weeks:
         subreddit, week = key
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            title_tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords)
+            title_tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords, ngram_output)
             tfs.update(title_tokens)
             if post.selftext is not None and post.selftext != "":
-                selftext_tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords)
+                selftext_tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords, ngram_output)
                 tfs.update(selftext_tokens)
             authors.update([post.author])
 
@@ -194,7 +193,7 @@ def weekly_tf(partition,
 
     # we follow the approach described in datta, phelan, adar 2017
 
-    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords)
+    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output)
 
     outchunksize = 100000
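For context, a minimal sketch of how the threaded parameter could be produced and consumed. The helper names and the file-append behaviour below are assumptions for illustration, not code from this commit: the caller derives the n-gram output path from the partition name (mirroring the line removed from my_tokenizer) and passes it down, and the tokenizer's first multi-word-expression pass writes its roughly 10 percent sample of sentences to that file.

# Hypothetical sketch, not code from this commit: ngram_output is derived by
# the caller and threaded down to the tokenizer instead of being computed from
# a `partition` variable inside my_tokenizer.
from random import random

def run_weekly_tf_sketch(partition, tf_func, subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
    # Derive the sample-output path from the partition file name, as the
    # removed line in my_tokenizer used to do, then pass it through.
    ngram_output = partition.replace("parquet", "txt")
    return tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output)

def write_sampled_sentence(sentence, ngram_output):
    # During the first MWE pass, append roughly 10% of sentences (given as
    # lists of tokens) to the ngram_output text file so that frequent
    # 1-4 grams can be mined from the sample later.
    if random() <= 0.1:
        with open(ngram_output, 'a') as outfile:
            outfile.write(' '.join(sentence) + '\n')

Passing the path as an explicit argument keeps my_tokenizer independent of the enclosing partition variable, which is why every call site in the diff gains the extra ngram_output argument.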