pass through mwe_pass
This commit is contained in:
parent
2decdc9750
commit
91cc1edf02
@@ -30,7 +30,7 @@ def remove_punct(sentence):
|
|||||||
new_sentence.append(new_token)
|
new_sentence.append(new_token)
|
||||||
return new_sentence
|
return new_sentence
|
||||||
|
|
||||||
def my_tokenizer(text):
|
def my_tokenizer(text, mwe_pass):
|
||||||
# remove stopwords, punctuation, urls, lower case
|
# remove stopwords, punctuation, urls, lower case
|
||||||
# lowercase
|
# lowercase
|
||||||
text = text.lower()
|
text = text.lower()
|
||||||
@@ -75,13 +75,13 @@ def my_tokenizer(text):
|
|||||||
yield token
|
yield token
|
||||||
|
|
||||||
|
|
||||||
def tf_comments(subreddit_weeks):
|
def tf_comments(subreddit_weeks, mwe_pass):
|
||||||
for key, posts in subreddit_weeks:
|
for key, posts in subreddit_weeks:
|
||||||
subreddit, week = key
|
subreddit, week = key
|
||||||
tfs = Counter([])
|
tfs = Counter([])
|
||||||
authors = Counter([])
|
authors = Counter([])
|
||||||
for post in posts:
|
for post in posts:
|
||||||
tokens = my_tokenizer(post.body)
|
tokens = my_tokenizer(post.body, mwe_pass)
|
||||||
tfs.update(tokens)
|
tfs.update(tokens)
|
||||||
authors.update([post.author])
|
authors.update([post.author])
|
||||||
|
|
||||||
@@ -91,13 +91,13 @@ def tf_comments(subreddit_weeks):
|
|||||||
for author, tf in authors.items():
|
for author, tf in authors.items():
|
||||||
yield [False, subreddit, author, week, tf]
|
yield [False, subreddit, author, week, tf]
|
||||||
|
|
||||||
def tf_posts(subreddit_weeks):
|
def tf_posts(subreddit_weeks, mwe_pass):
|
||||||
for key, posts in subreddit_weeks:
|
for key, posts in subreddit_weeks:
|
||||||
subreddit, week = key
|
subreddit, week = key
|
||||||
tfs = Counter([])
|
tfs = Counter([])
|
||||||
authors = Counter([])
|
authors = Counter([])
|
||||||
for post in posts:
|
for post in posts:
|
||||||
tokens = my_tokenizer(post.title)
|
tokens = my_tokenizer(post.title, mwe_pass)
|
||||||
tfs.update(tokens)
|
tfs.update(tokens)
|
||||||
authors.update([post.author])
|
authors.update([post.author])
|
||||||
|
|
||||||
@@ -184,7 +184,7 @@ def weekly_tf(partition,
|
|||||||
|
|
||||||
# we follow the approach described in datta, phelan, adar 2017
|
# we follow the approach described in datta, phelan, adar 2017
|
||||||
|
|
||||||
outrows = tf_func(subreddit_weeks)
|
outrows = tf_func(subreddit_weeks, mwe_pass)
|
||||||
|
|
||||||
outchunksize = 10000
|
outchunksize = 10000
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user