
pass ngram_output through.

Nathan TeBlunthuis 2024-12-03 19:05:44 -08:00
parent a179d608eb
commit ec5859c311


@@ -30,7 +30,7 @@ def remove_punct(sentence):
             new_sentence.append(new_token)
     return new_sentence
 
-def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
+def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords, ngram_output):
     # remove stopwords, punctuation, urls, lower case
     # lowercase
     if text is None:
@@ -58,7 +58,6 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
         # they say that the extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
         # here we take a 10 percent sample of sentences
         if mwe_pass == 'first':
-            ngram_output = partition.replace("parquet","txt")
             sentences = list(sentences)
             for sentence in sentences:
                 if random() <= 0.1:
@@ -79,13 +78,13 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords):
                     yield token
 
-def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
+def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output):
     for key, posts in subreddit_weeks:
         subreddit, week = key
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords)
+            tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords, ngram_output)
             tfs.update(tokens)
             authors.update([post.author])
@@ -95,16 +94,16 @@ def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
     for author, tf in authors.items():
         yield [False, subreddit, author, week, tf]
 
-def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
+def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output):
     for key, posts in subreddit_weeks:
         subreddit, week = key
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            title_tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords)
+            title_tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords, ngram_output)
             tfs.update(title_tokens)
             if post.selftext is not None and post.selftext != "":
-                selftext_tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords)
+                selftext_tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords, ngram_output)
                 tfs.update(selftext_tokens)
             authors.update([post.author])
@@ -194,7 +193,7 @@ def weekly_tf(partition,
     # we follow the approach described in datta, phelan, adar 2017
-    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords)
+    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output)
     outchunksize = 100000
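
Note: before this commit, my_tokenizer derived ngram_output from partition inside the first MWE pass (ngram_output = partition.replace("parquet","txt")); after it, the caller computes the path once and passes it explicitly through tf_comments / tf_posts down to my_tokenizer. The sketch below illustrates only that threading pattern; it is a simplified, hypothetical stand-in (mwe_tokenize, stopWords, authors, and the Spark plumbing are omitted, and the toy data in weekly_tf is invented), not the repository's actual code.

from collections import Counter
from random import random

def my_tokenizer(text, mwe_pass, ngram_output):
    # simplified stand-in: ngram_output is now supplied by the caller
    tokens = (text or "").lower().split()
    if mwe_pass == 'first' and random() <= 0.1:
        # first pass: a sample of sentences is appended to the ngram output file
        with open(ngram_output, 'a') as f:
            f.write(' '.join(tokens) + '\n')
    return tokens

def tf_comments(subreddit_weeks, mwe_pass, ngram_output):
    for (subreddit, week), posts in subreddit_weeks:
        tfs = Counter()
        for selftext in posts:
            tfs.update(my_tokenizer(selftext, mwe_pass, ngram_output))
        yield subreddit, week, tfs

def weekly_tf(partition, mwe_pass='first'):
    # ngram_output is derived once here, mirroring the line removed from my_tokenizer
    ngram_output = partition.replace("parquet", "txt")
    subreddit_weeks = [(("example", "2017-01-02"), ["hello world", "hello again"])]
    return list(tf_comments(subreddit_weeks, mwe_pass, ngram_output))

print(weekly_tf("comments.parquet"))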