1
0

It's `selftext`, not `body`: read post text from the `selftext` field

This commit is contained in:
Nathan TeBlunthuis 2024-12-03 18:59:27 -08:00
parent 5045d6052e
commit 73dd2a96a6

View File

@@ -84,7 +84,7 @@ def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
tfs = Counter([])
authors = Counter([])
for post in posts:
tokens = my_tokenizer(post.body, mwe_pass, mwe_tokenize, stopWords)
tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords)
tfs.update(tokens)
authors.update([post.author])
@@ -102,9 +102,9 @@ def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
for post in posts:
title_tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords)
tfs.update(title_tokens)
if post.body is not None and post.body != "":
body_tokens = my_tokenizer(post.body, mwe_pass, mwe_tokenize, stopWords)
tfs.update(body_tokens)
if post.selftext is not None and post.selftext != "":
selftext_tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords)
tfs.update(selftext_tokens)
authors.update([post.author])
for term, tf in tfs.items():
@@ -147,7 +147,7 @@ def weekly_tf(partition,
if reddit_dataset == 'comments':
batches = dataset.to_batches(columns=['CreatedAt','subreddit','body','author'])
if reddit_dataset == 'posts':
batches = dataset.to_batches(columns=['CreatedAt','subreddit','title','body','author'])
batches = dataset.to_batches(columns=['CreatedAt','subreddit','title','selftext','author'])
schema = pa.schema([pa.field('subreddit', pa.string(), nullable=nullable_schema),
pa.field('term', pa.string(), nullable=nullable_schema),