From 3c1d5df97e027e1dd27e964b45e25b5731c10323 Mon Sep 17 00:00:00 2001
From: Nathan TeBlunthuis
Date: Fri, 10 Jan 2025 06:20:38 -0800
Subject: [PATCH] add submissions to timeseries.

---
Reviewer note: comments and submissions are each projected down to the same
three columns (subreddit, CreatedAt, author) in the same order before the
union, because DataFrame.union matches columns by position. The submissions
projection must come from df_submissions; selecting from df_comments would
union the comments with themselves and double-count every comment author row
while leaving submissions out of the weekly series.

 timeseries/cluster_timeseries.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/timeseries/cluster_timeseries.py b/timeseries/cluster_timeseries.py
index 7ffcf0f..3b47474 100644
--- a/timeseries/cluster_timeseries.py
+++ b/timeseries/cluster_timeseries.py
@@ -14,8 +14,13 @@ def build_cluster_timeseries(term_clusters_path="/gscratch/comdata/output/reddit
 
     spark = SparkSession.builder.getOrCreate()
 
-    df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet")
-
+    df_comments = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet")
+    df_submissions = spark.read.parquet("/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet")
+
+    df_comments = df_comments.select(['subreddit','CreatedAt','author'])
+    df_submissions = df_submissions.select(['subreddit','CreatedAt','author'])
+    df = df_comments.union(df_submissions)
+
     df = df.withColumn('week', f.date_trunc('week', f.col("CreatedAt")))
 
     # time of unique authors by series by week