From 5045d6052ef89883b3c9e2c071273009edde12ab Mon Sep 17 00:00:00 2001
From: Nathan TeBlunthuis
Date: Tue, 3 Dec 2024 18:53:41 -0800
Subject: [PATCH] use post title and body in terms

---
 ngrams/term_frequencies.py    | 9 ++++++---
 ngrams/top_comment_phrases.py | 3 +++
 2 files changed, 9 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 ngrams/top_comment_phrases.py

diff --git a/ngrams/term_frequencies.py b/ngrams/term_frequencies.py
index 4bf5497..ef2404f 100755
--- a/ngrams/term_frequencies.py
+++ b/ngrams/term_frequencies.py
@@ -100,8 +100,11 @@ def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords)
-            tfs.update(tokens)
+            title_tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords)
+            tfs.update(title_tokens)
+            if post.body is not None and post.body != "":
+                body_tokens = my_tokenizer(post.body, mwe_pass, mwe_tokenize, stopWords)
+                tfs.update(body_tokens)
             authors.update([post.author])
 
         for term, tf in tfs.items():
@@ -144,7 +147,7 @@ def weekly_tf(partition,
     if reddit_dataset == 'comments':
         batches = dataset.to_batches(columns=['CreatedAt','subreddit','body','author'])
     if reddit_dataset == 'posts':
-        batches = dataset.to_batches(columns=['CreatedAt','subreddit','title','author'])
+        batches = dataset.to_batches(columns=['CreatedAt','subreddit','title','body','author'])
 
     schema = pa.schema([pa.field('subreddit', pa.string(), nullable=nullable_schema),
                         pa.field('term', pa.string(), nullable=nullable_schema),
diff --git a/ngrams/top_comment_phrases.py b/ngrams/top_comment_phrases.py
old mode 100644
new mode 100755
index 031cba5..2884eec
--- a/ngrams/top_comment_phrases.py
+++ b/ngrams/top_comment_phrases.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 from pyspark.sql import functions as f
 from pyspark.sql import Window
 from pyspark.sql import SparkSession
@@ -5,6 +6,8 @@ import numpy as np
 spark = SparkSession.builder.getOrCreate()
 
 df = spark.read.text("/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/")
+df2 = spark.read.text("/gscratch/comdata/users/nathante/reddit_post_ngrams_10p_sample/")
+df = df.union(df2)
 
 df = df.withColumnRenamed("value","phrase")
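
Note (not part of the patch): a minimal, self-contained sketch of the term-counting behavior the first hunk introduces in tf_posts. Title tokens are always counted, and body tokens are added only when the post has a non-empty body. The Post record and simple_tokenizer below are placeholders for illustration; the patched code iterates PyArrow record batches and calls my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords).

from collections import Counter, namedtuple

# Placeholder post record and tokenizer; illustration only, not the
# tokenizer used in ngrams/term_frequencies.py.
Post = namedtuple('Post', ['title', 'body', 'author'])

def simple_tokenizer(text):
    return text.lower().split()

def count_post_terms(posts):
    tfs = Counter()
    authors = Counter()
    for post in posts:
        # Title terms are always counted.
        tfs.update(simple_tokenizer(post.title))
        # Body terms are counted only for posts with a non-empty body
        # (self posts); link posts typically have no body text.
        if post.body is not None and post.body != "":
            tfs.update(simple_tokenizer(post.body))
        authors.update([post.author])
    return tfs, authors

# Example: one self post with a body, one link post without.
posts = [Post("An example title", "Some body text", "alice"),
         Post("A link post title", None, "bob")]
tfs, authors = count_post_terms(posts)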