pass ngram_output through.
This commit is contained in:
parent a179d608eb
commit ec5859c311
@@ -30,7 +30,7 @@ def remove_punct(sentence):
             new_sentence.append(new_token)
     return new_sentence

-def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
+def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords, ngram_output):
     # remove stopwords, punctuation, urls, lower case
     # lowercase
     if text is None:
@@ -58,7 +58,6 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
     # they say that the extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
     # here we take a 10 percent sample of sentences
     if mwe_pass == 'first':
-        ngram_output = partition.replace("parquet","txt")
         sentences = list(sentences)
         for sentence in sentences:
             if random() <= 0.1:
@@ -79,13 +78,13 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
             yield token


-def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
+def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output):
     for key, posts in subreddit_weeks:
         subreddit, week = key
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords)
+            tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords, ngram_output)
             tfs.update(tokens)
             authors.update([post.author])

@@ -95,16 +94,16 @@ def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
         for author, tf in authors.items():
             yield [False, subreddit, author, week, tf]

-def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
+def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output):
     for key, posts in subreddit_weeks:
         subreddit, week = key
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            title_tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords)
+            title_tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords, ngram_output)
             tfs.update(title_tokens)
             if post.selftext is not None and post.selftext != "":
-                selftext_tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords)
+                selftext_tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords, ngram_output)
                 tfs.update(selftext_tokens)
             authors.update([post.author])

@@ -194,7 +193,7 @@ def weekly_tf(partition,

     # we follow the approach described in datta, phelan, adar 2017

-    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords)
+    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output)

     outchunksize = 100000

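Taken together, the hunks thread one new ngram_output argument from weekly_tf's tf_func call through tf_posts/tf_comments into my_tokenizer, and drop the line that rebuilt the path inside the tokenizer from a partition name that is not among its parameters. The diff does not show where ngram_output is now computed; the sketch below assumes weekly_tf derives it from its partition argument with the same replace("parquet", "txt") expression. Everything else here (the stub bodies, dict posts, simplified return value) is illustrative, not the repository's code.

```python
# Runnable sketch (not the repository's code) of the parameter flow this
# commit sets up. Only the function signatures and the
# partition.replace("parquet", "txt") expression come from the diff;
# the bodies below are simplified stubs.
from collections import Counter

def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords, ngram_output):
    # Stub: lowercase and split. On the first pass the real tokenizer would
    # also write sampled n-grams to ngram_output (see the next sketch).
    for token in (text or "").lower().split():
        if token not in stopWords:
            yield token

def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output):
    for (subreddit, week), posts in subreddit_weeks:
        tfs = Counter()
        for post in posts:
            tfs.update(my_tokenizer(post["title"], mwe_pass, mwe_tokenize,
                                    stopWords, ngram_output))
        yield subreddit, week, tfs

def weekly_tf(partition, subreddit_weeks, mwe_pass, mwe_tokenize, stopWords,
              tf_func=tf_posts):
    # The caller derives the n-gram output path from the partition name once
    # and passes it down, instead of my_tokenizer reconstructing it.
    ngram_output = partition.replace("parquet", "txt")
    return list(tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords,
                        ngram_output))

if __name__ == "__main__":
    weeks = [(("AskReddit", "2017-01-02"),
              [{"title": "A question about phrase detection"}])]
    print(weekly_tf("comments/part-0.parquet", weeks, "second",
                    None, {"a", "the", "about"}))
```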
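Why the tokenizer needs an output path at all: on the first pass (mwe_pass == 'first'), the comments say 1-4 grams are extracted from a roughly 10% sample of sentences so that frequent phrases can later be found relative to their component terms, following Datta, Phelan, and Adar (2017). A minimal sketch of that dump step, assuming nltk.util.ngrams and plain append-to-file output — neither detail is shown in this diff:

```python
# Sketch of the first-pass n-gram dump that ngram_output presumably feeds.
# Assumptions not shown in the diff: nltk.util.ngrams, append mode,
# one space-joined n-gram per output line.
from random import random
from nltk.util import ngrams

def dump_sample_ngrams(sentences, ngram_output, sample_rate=0.1, max_n=4):
    """Append 1-4 grams from roughly sample_rate of the tokenized sentences
    to ngram_output, for later phrase (multiword expression) detection."""
    with open(ngram_output, "a", encoding="utf-8") as out:
        for sentence in sentences:          # each sentence: a list of tokens
            if random() <= sample_rate:
                for n in range(1, max_n + 1):
                    for gram in ngrams(sentence, n):
                        out.write(" ".join(gram) + "\n")
```

On later passes the collected phrases would presumably be folded into mwe_tokenize (for example an nltk MWETokenizer) rather than written out again; that logic is outside the lines shown here.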