use post title and body in terms

parent 51234f1070
commit 5045d6052e
@@ -100,8 +100,11 @@ def tf_posts(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords):
     tfs = Counter([])
     authors = Counter([])
     for post in posts:
-        tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords)
-        tfs.update(tokens)
+        title_tokens = my_tokenizer(post.title, mwe_pass, mwe_tokenize, stopWords)
+        tfs.update(title_tokens)
+        if post.body is not None and post.body != "":
+            body_tokens = my_tokenizer(post.body, mwe_pass, mwe_tokenize, stopWords)
+            tfs.update(body_tokens)
         authors.update([post.author])
 
     for term, tf in tfs.items():
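For context on the hunk above: Counter.update adds counts in place, so tokenizing the title and the (non-empty) body separately and feeding both into the same counter yields combined per-post term frequencies. A minimal sketch, with made-up token lists standing in for the output of the repo's my_tokenizer:

from collections import Counter

tfs = Counter()
title_tokens = ["reddit", "ngrams"]        # stand-in for my_tokenizer(post.title, ...)
body_tokens = ["ngrams", "are", "useful"]  # stand-in for my_tokenizer(post.body, ...)
tfs.update(title_tokens)
tfs.update(body_tokens)
print(tfs)  # Counter({'ngrams': 2, 'reddit': 1, 'are': 1, 'useful': 1})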
@@ -144,7 +147,7 @@ def weekly_tf(partition,
     if reddit_dataset == 'comments':
         batches = dataset.to_batches(columns=['CreatedAt','subreddit','body','author'])
     if reddit_dataset == 'posts':
-        batches = dataset.to_batches(columns=['CreatedAt','subreddit','title','author'])
+        batches = dataset.to_batches(columns=['CreatedAt','subreddit','title','body','author'])
 
     schema = pa.schema([pa.field('subreddit', pa.string(), nullable=nullable_schema),
                         pa.field('term', pa.string(), nullable=nullable_schema),
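The posts branch now projects the body column as well, so the tokenizer can see it downstream. pyarrow datasets read only the columns named in to_batches, which keeps I/O and memory bounded on large dumps. A minimal sketch, assuming a hypothetical parquet dataset path:

import pyarrow.dataset as ds

# Hypothetical path; the real path is supplied by weekly_tf's caller.
dataset = ds.dataset("/path/to/reddit_posts.parquet", format="parquet")

# Column projection: only the listed columns are read from disk.
for batch in dataset.to_batches(columns=['CreatedAt','subreddit','title','body','author']):
    print(batch.num_rows, batch.schema.names)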
ngrams/top_comment_phrases.py (3 changes, Normal file → Executable file)
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 from pyspark.sql import functions as f
 from pyspark.sql import Window
 from pyspark.sql import SparkSession
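The hunk above adds a shebang; together with the Normal file → Executable file mode change noted in the file header, this lets the script be invoked directly (the effect of a local chmod +x ngrams/top_comment_phrases.py) rather than only via an explicit python3.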
@@ -5,6 +6,8 @@ import numpy as np
 
 spark = SparkSession.builder.getOrCreate()
 df = spark.read.text("/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/")
+df2 = spark.read.text("/gscratch/comdata/users/nathante/reddit_post_ngrams_10p_sample/")
+df = df.union(df2)
 
 df = df.withColumnRenamed("value","phrase")
 
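This hunk pools the comment and post n-gram samples before phrase scoring. DataFrame.union concatenates rows positionally, which is safe here because both inputs come from spark.read.text and therefore share the same single-column value schema. A self-contained sketch with hypothetical local paths:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# spark.read.text always yields one string column named "value",
# so a positional union of the two samples is schema-safe.
comments = spark.read.text("comment_ngrams_sample/")  # hypothetical path
posts = spark.read.text("post_ngrams_sample/")        # hypothetical path
phrases = comments.union(posts).withColumnRenamed("value", "phrase")
phrases.show(5)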