1
0

add submissions to timeseries.

This commit is contained in:
Nathan TeBlunthuis 2025-01-10 06:20:38 -08:00
parent 81e12d1cef
commit 3c1d5df97e

View File

@@ -14,7 +14,12 @@ def build_cluster_timeseries(term_clusters_path="/gscratch/comdata/output/reddit
spark = SparkSession.builder.getOrCreate() spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet") df_comments = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet")
df_submissions = spark.read.parquet("/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet")
df_comments = df_comments.select(['subreddit','CreatedAt','author'])
df_submisisons = df_comments.select(['subreddit','CreatedAt','author'])
df = df_comments.union(df_submisisons)
df = df.withColumn('week', f.date_trunc('week', f.col("CreatedAt"))) df = df.withColumn('week', f.date_trunc('week', f.col("CreatedAt")))