diff --git a/ngrams/term_frequencies.py b/ngrams/term_frequencies.py
index ef2404f..c8c88de 100755
--- a/ngrams/term_frequencies.py
+++ b/ngrams/term_frequencies.py
@@ -102,9 +102,9 @@ def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
         for post in posts:
             title_tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords)
             tfs.update(title_tokens)
-            if post.body is not None and post.body != "":
-                body_tokens = my_tokenizer(post.body, mwe_pass, mwe_tokenize, stopWords)
-                tfs.update(body_tokens)
+            if post.selftext is not None and post.selftext != "":
+                selftext_tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords)
+                tfs.update(selftext_tokens)
             authors.update([post.author])
 
     for term, tf in tfs.items():
@@ -147,7 +147,7 @@ def weekly_tf(partition,
     if reddit_dataset == 'comments':
         batches = dataset.to_batches(columns=['CreatedAt','subreddit','body','author'])
     if reddit_dataset == 'posts':
-        batches = dataset.to_batches(columns=['CreatedAt','subreddit','title','body','author'])
+        batches = dataset.to_batches(columns=['CreatedAt','subreddit','title','selftext','author'])
 
     schema = pa.schema([pa.field('subreddit', pa.string(), nullable=nullable_schema),
                        pa.field('term', pa.string(), nullable=nullable_schema),