allow authors to be null in submissions.
This commit is contained in:
parent
a5ca25dd6e
commit
22d6a6961c
@ -119,9 +119,10 @@ def weekly_tf(partition,
|
||||
|
||||
if reddit_dataset == 'comments':
|
||||
tf_func = tf_comments
|
||||
nullable_schema = False
|
||||
elif reddit_dataset == 'posts':
|
||||
tf_func = tf_posts
|
||||
|
||||
nullable_schema = True
|
||||
dataset = ds.dataset(f"{input_parquet}/{partition}", format='parquet')
|
||||
if not os.path.exists(output_10p_sample_path):
|
||||
os.mkdir(output_10p_sample_path)
|
||||
@ -140,16 +141,16 @@ def weekly_tf(partition,
|
||||
if reddit_dataset == 'posts':
|
||||
batches = dataset.to_batches(columns=['CreatedAt','subreddit','title','author'])
|
||||
|
||||
schema = pa.schema([pa.field('subreddit', pa.string(), nullable=False),
|
||||
pa.field('term', pa.string(), nullable=False),
|
||||
pa.field('week', pa.date32(), nullable=False),
|
||||
pa.field('tf', pa.int64(), nullable=False)]
|
||||
schema = pa.schema([pa.field('subreddit', pa.string(), nullable=nullable_schema),
|
||||
pa.field('term', pa.string(), nullable=nullable_schema),
|
||||
pa.field('week', pa.date32(), nullable=nullable_schema),
|
||||
pa.field('tf', pa.int64(), nullable=nullable_schema)]
|
||||
)
|
||||
|
||||
author_schema = pa.schema([pa.field('subreddit', pa.string(), nullable=False),
|
||||
pa.field('author', pa.string(), nullable=False),
|
||||
pa.field('week', pa.date32(), nullable=False),
|
||||
pa.field('tf', pa.int64(), nullable=False)]
|
||||
author_schema = pa.schema([pa.field('subreddit', pa.string(), nullable=nullable_schema),
|
||||
pa.field('author', pa.string(), nullable=nullable_schema),
|
||||
pa.field('week', pa.date32(), nullable=nullable_schema),
|
||||
pa.field('tf', pa.int64(), nullable=nullable_schema)]
|
||||
)
|
||||
|
||||
dfs = (b.to_pandas() for b in batches)
|
||||
|
Loading…
Reference in New Issue
Block a user