
Use DuckDB instead of Spark to prepare for weekly similarities.

Nathan TeBlunthuis 2024-12-28 13:45:17 -08:00
parent 74ee86e443
commit 104b708ff6


@@ -3,6 +3,7 @@ from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
 import numpy as np
+import duckdb
 import pyarrow
 import pyarrow.dataset as ds
 import pandas as pd
@@ -72,22 +73,20 @@ def cosine_similarities_weekly_lsi(*args, n_components=100, lsi_model=None, **kwargs):
     return cosine_similarities_weekly(*args, simfunc=simfunc, **kwargs)

-#tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_weekly.parquet')
+#tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf_weekly/comment_submission_terms_tfidf.parquet')
 def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subreddits = None, topN = None, simfunc=column_similarities, min_df=None,max_df=None):
     print(outfile)
     # do this step in parallel if we have the memory for it.
     # should be doable with pool.map
-    spark = SparkSession.builder.getOrCreate()
-    df = spark.read.parquet(tfidf_path)
-    # load subreddits + topN
-    subreddit_names = df.select(['subreddit','subreddit_id']).distinct().toPandas()
-    subreddit_names = subreddit_names.sort_values("subreddit_id")
-    nterms = df.select(f.max(f.col(term_colname + "_id")).alias('max')).collect()[0].max
-    weeks = df.select(f.col("week")).distinct().toPandas().week.values
-    spark.stop()
+    conn = duckdb.connect()
+    subreddit_names = conn.execute(f"SELECT DISTINCT subreddit, subreddit_id from read_parquet('{tfidf_path}/*/*.parquet') ORDER BY subreddit_id;").df()
+    nterms = conn.execute(f"SELECT MAX({term_colname + '_id'}) as nterms FROM read_parquet('{tfidf_path}/*/*.parquet')").df()
+    nterms = nterms.nterms.values
+    weeks = conn.execute(f"SELECT DISTINCT week FROM read_parquet('{tfidf_path}/*/*.parquet')").df()
+    weeks = weeks.week.values
+    conn.close()
     print(f"computing weekly similarities")
     week_similarities_helper = partial(_week_similarities,simfunc=simfunc, tfidf_path=tfidf_path, term_colname=term_colname, outdir=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=None, subreddit_names=subreddit_names,nterms=nterms)
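
The change replaces a full Spark session with three DuckDB queries that scan the week-partitioned TF-IDF parquet files directly to pull out the metadata the weekly loop needs: the subreddit name/id table, the number of terms, and the list of weeks. Below is a rough standalone sketch of that step; the path and term column name are placeholder values, the '*/*.parquet' glob assumes the same one-level week partitioning as the diff, and the scalar extraction with [0] on nterms is an addition here (the committed code keeps the one-element array).

import duckdb

# Placeholder inputs; in the repository these arrive as function arguments.
tfidf_path = "data/tfidf_weekly/comment_terms.parquet"   # hypothetical path
term_colname = "term"                                    # hypothetical column prefix

conn = duckdb.connect()             # in-memory DuckDB connection, no Spark session
glob = f"{tfidf_path}/*/*.parquet"  # one partition directory per week, as in the diff

# Distinct subreddit name/id pairs, ordered by subreddit_id
# (mirroring the sort_values call in the removed Spark code).
subreddit_names = conn.execute(
    f"SELECT DISTINCT subreddit, subreddit_id FROM read_parquet('{glob}') ORDER BY subreddit_id"
).df()

# Largest term id, used downstream to size the term-document matrix.
nterms = conn.execute(
    f"SELECT MAX({term_colname}_id) AS nterms FROM read_parquet('{glob}')"
).df().nterms.values[0]

# Weeks to iterate over when computing weekly similarities.
weeks = conn.execute(
    f"SELECT DISTINCT week FROM read_parquet('{glob}')"
).df().week.values

conn.close()
print(len(subreddit_names), nterms, len(weeks))

Compared with the removed Spark version, this avoids JVM startup and a distributed job for what amounts to three small aggregation queries, and DuckDB only reads the columns each query touches, which is presumably the motivation behind the commit message.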