From 89d03dd95650dfd3bb11637425ca601e0916759c Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 4 Dec 2024 09:24:45 -0800 Subject: [PATCH] consistent naming and bugfix. --- ngrams/term_frequencies.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ngrams/term_frequencies.py b/ngrams/term_frequencies.py index 69ad565..07ba90e 100755 --- a/ngrams/term_frequencies.py +++ b/ngrams/term_frequencies.py @@ -80,14 +80,14 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords, ngram_output): def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output): - for key, posts in subreddit_weeks: + for key, comments in subreddit_weeks: subreddit, week = key tfs = Counter([]) authors = Counter([]) - for post in posts: - tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords, ngram_output) + for comment in comments: + tokens = my_tokenizer(comment.body, mwe_pass, mwe_tokenize, stopWords, ngram_output) tfs.update(tokens) - authors.update([post.author]) + authors.update([comment.author]) for term, tf in tfs.items(): yield [True, subreddit, term, week, tf] @@ -261,7 +261,7 @@ def gen_task_list(mwe_pass='first', temp_output_tfidf_path="/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp/", output_terms_path="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet", output_authors_path="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet", - dataset='comments'): + reddit_dataset='comments'): files = os.listdir(input_parquet) curdir = Path('.')