use post title and body in terms

Nathan TeBlunthuis 2024-12-03 18:53:41 -08:00
parent 51234f1070
commit 5045d6052e
2 changed files with 9 additions and 3 deletions


@@ -100,8 +100,11 @@ def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
         tfs = Counter([])
         authors = Counter([])
         for post in posts:
-            tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords)
-            tfs.update(tokens)
+            title_tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords)
+            tfs.update(title_tokens)
+            if post.body is not None and post.body != "":
+                body_tokens = my_tokenizer(post.body, mwe_pass, mwe_tokenize, stopWords)
+                tfs.update(body_tokens)
             authors.update([post.author])
 
         for term, tf in tfs.items():
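For context outside the diff, here is a minimal sketch of what the patched loop now computes, assuming my_tokenizer returns a list of string tokens and posts expose title, body, and author attributes; the helper name count_post_terms is hypothetical:

```python
from collections import Counter

def count_post_terms(posts, tokenize):
    """Hypothetical helper mirroring the patched tf_posts loop."""
    tfs = Counter()
    authors = Counter()
    for post in posts:
        # Titles are always tokenized; bodies only when present and
        # non-empty -- exactly the guard the diff adds.
        tfs.update(tokenize(post.title))
        if post.body is not None and post.body != "":
            tfs.update(tokenize(post.body))
        authors.update([post.author])
    return tfs, authors
```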
@@ -144,7 +147,7 @@ def weekly_tf(partition,
     if reddit_dataset == 'comments':
         batches = dataset.to_batches(columns=['CreatedAt','subreddit','body','author'])
     if reddit_dataset == 'posts':
-        batches = dataset.to_batches(columns=['CreatedAt','subreddit','title','author'])
+        batches = dataset.to_batches(columns=['CreatedAt','subreddit','title','body','author'])
     schema = pa.schema([pa.field('subreddit', pa.string(), nullable=nullable_schema),
                        pa.field('term', pa.string(), nullable=nullable_schema),
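The to_batches call is PyArrow's streaming scan over a dataset; adding 'body' to the column projection is what makes post bodies reach tf_posts at all. A hedged sketch of the pattern, with a placeholder path that is not from the commit:

```python
import pyarrow.dataset as ds

# Project only the columns needed downstream; the dataset is scanned
# lazily in record batches rather than loaded whole into memory.
dataset = ds.dataset("reddit_posts.parquet", format="parquet")  # placeholder path
for batch in dataset.to_batches(columns=['CreatedAt', 'subreddit', 'title', 'body', 'author']):
    print(batch.num_rows)
```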

ngrams/top_comment_phrases.py (Normal file → Executable file)

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 from pyspark.sql import functions as f
 from pyspark.sql import Window
 from pyspark.sql import SparkSession
@@ -5,6 +6,8 @@ import numpy as np
 spark = SparkSession.builder.getOrCreate()
 df = spark.read.text("/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/")
+df2 = spark.read.text("/gscratch/comdata/users/nathante/reddit_post_ngrams_10p_sample/")
+df = df.union(df2)
 df = df.withColumnRenamed("value","phrase")
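For reference, spark.read.text yields a one-column DataFrame whose column is named value, so the comment and post samples can be unioned by position before the rename; a small sketch with placeholder paths:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Both reads produce the same single-column schema ("value": string),
# so union() stacks them safely before the column is renamed.
comments = spark.read.text("comment_ngrams_sample/")  # placeholder path
posts = spark.read.text("post_ngrams_sample/")        # placeholder path
phrases = comments.union(posts).withColumnRenamed("value", "phrase")
```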