add submissions to timeseries.
This commit is contained in:
parent
81e12d1cef
commit
3c1d5df97e
@ -14,8 +14,13 @@ def build_cluster_timeseries(term_clusters_path="/gscratch/comdata/output/reddit
|
||||
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
|
||||
df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet")
|
||||
|
||||
df_comments = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet")
|
||||
df_submissions = spark.read.parquet("/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet")
|
||||
|
||||
df_comments = df_comments.select(['subreddit','CreatedAt','author'])
|
||||
# Project the *submissions* frame (not the comments frame) onto the same
# three columns as the comments, so the union below actually adds
# submissions instead of duplicating every comment row.
# NOTE: the variable name keeps its existing spelling because the union
# statement that follows refers to it by that name.
df_submisisons = df_submissions.select(['subreddit','CreatedAt','author'])
|
||||
df = df_comments.union(df_submisisons)
|
||||
|
||||
df = df.withColumn('week', f.date_trunc('week', f.col("CreatedAt")))
|
||||
|
||||
# time of unique authors by series by week
|
||||
|
Loading…
Reference in New Issue
Block a user