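Consistent naming and bugfix: tf_comments now iterates over comments and tokenizes comment.body (previously named posts and read post.selftext), and the gen_task_list parameter dataset is renamed to reddit_dataset.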
This commit is contained in:
parent
472849ebd9
commit
89d03dd956
@@ -80,14 +80,14 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords, ngram_output):
|
|||||||
|
|
||||||
|
|
||||||
def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output):
|
def tf_comments(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output):
|
||||||
for key, posts in subreddit_weeks:
|
for key, comments in subreddit_weeks:
|
||||||
subreddit, week = key
|
subreddit, week = key
|
||||||
tfs = Counter([])
|
tfs = Counter([])
|
||||||
authors = Counter([])
|
authors = Counter([])
|
||||||
for post in posts:
|
for comment in comments:
|
||||||
tokens = my_tokenizer(post.selftext, mwe_pass, mwe_tokenize, stopWords, ngram_output)
|
tokens = my_tokenizer(comment.body, mwe_pass, mwe_tokenize, stopWords, ngram_output)
|
||||||
tfs.update(tokens)
|
tfs.update(tokens)
|
||||||
authors.update([post.author])
|
authors.update([comment.author])
|
||||||
|
|
||||||
for term, tf in tfs.items():
|
for term, tf in tfs.items():
|
||||||
yield [True, subreddit, term, week, tf]
|
yield [True, subreddit, term, week, tf]
|
||||||
@@ -261,7 +261,7 @@ def gen_task_list(mwe_pass='first',
|
|||||||
temp_output_tfidf_path="/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp/",
|
temp_output_tfidf_path="/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp/",
|
||||||
output_terms_path="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
|
output_terms_path="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
|
||||||
output_authors_path="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
|
output_authors_path="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
|
||||||
dataset='comments'):
|
reddit_dataset='comments'):
|
||||||
files = os.listdir(input_parquet)
|
files = os.listdir(input_parquet)
|
||||||
|
|
||||||
curdir = Path('.')
|
curdir = Path('.')
|
||||||
|
Loading…
Reference in New Issue
Block a user