18
0
Files
cdsc_reddit/datasets/parquet_part1.py
Benjamin Mako Hill 8965a251b6 refactor datasets/ pipeline; add build/add-month workflows
Replace the four per-type scripts (comments/submissions x part1/part2)
with two merged scripts that share all of their plumbing — only the
schema and JSON parser differ between types. Drop the per-source part
rolling; one parquet per input zst, since Spark handles big parquet
files via internal row groups.

Add two thin runner scripts for the two common workflows:
build_from_scratch.sh wipes the temp dirs and processes everything,
add_new_month.sh takes YYYY-MM and parses just that month before
re-running the Spark sort. Every step in the runners is a separate
command so individual stages can be copied out and run standalone
for debugging.

Also fixes several lurking bugs in the original code: the hardcoded
/gscratch/comdata/users/nathante/ output path in comments Part 2;
the df2 = df.sortWithinPartitions typo in submissions Part 2 that
threw away the preceding global sort; references to a missing
parse_submissions.sh in the old .sh runners; and the asymmetry where
comments_2_parquet_part1.py wasn't per-file/fire-driven the way
submissions_2_parquet_part1.py was.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-25 16:30:54 -07:00

239 lines
8.7 KiB
Python
Executable File

#!/usr/bin/env python3
# Stage 1 of the dump-to-parquet pipeline: read a compressed Reddit dump
# (a single RC_*.zst comment file or RS_*.zst submission file) and write
# the parsed records to a per-source parquet file.
#
# Stage 2 (parquet_part2.py) re-reads the temp directory in Spark and
# produces the sorted, partitioned datasets.
#
# CLI:
# parquet_part1.py comments parse_dump RC_2018-08.zst
# parquet_part1.py comments gen_task_list
# parquet_part1.py submissions parse_dump RS_2018-08.zst
# parquet_part1.py submissions gen_task_list
#
# Override default paths with --dumpdir / --outdir when debugging.
import json
import os
from datetime import datetime
from itertools import islice
import fire
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import simdjson
from helper import find_dumps, open_fileset
# --- comments -------------------------------------------------------------
# Arrow schema for parsed comment rows.  Every column is nullable.  Field
# order matters: _parse_dump uses schema.names as the column labels for the
# positional row tuples returned by the parser.
COMMENT_SCHEMA = pa.schema([
    pa.field(column, dtype, nullable=True)
    for column, dtype in (
        ('id', pa.string()),
        ('subreddit', pa.string()),
        ('link_id', pa.string()),
        ('parent_id', pa.string()),
        ('created_utc', pa.timestamp('ms')),
        ('author', pa.string()),
        ('ups', pa.int64()),
        ('downs', pa.int64()),
        ('score', pa.int64()),
        ('edited', pa.bool_()),
        ('time_edited', pa.timestamp('ms')),
        ('subreddit_type', pa.string()),
        ('subreddit_id', pa.string()),
        ('stickied', pa.bool_()),
        ('is_submitter', pa.bool_()),
        ('body', pa.string()),
        ('error', pa.string()),
    )
])
# Output columns of parse_comment, in row order.  'time_edited' has no JSON
# key of its own (it is derived while handling 'edited'), but it is listed so
# that this list has exactly one entry per output column -- in particular so
# that the all-None error row has the same width as a normal row.
COMMENT_FIELDS = [
    "id", "subreddit", "link_id", "parent_id", "created_utc", "author",
    "ups", "downs", "score", "edited", "time_edited", "subreddit_type",
    "subreddit_id", "stickied", "is_submitter", "body", "error",
]


def parse_comment(line):
    """Parse one line of a comment dump into a row tuple.

    Args:
        line: one JSON-encoded comment.

    Returns:
        A tuple with one value per entry of COMMENT_FIELDS, in that order.
        Keys absent from the JSON object become None.  'edited' is split
        into a boolean ('edited') and a timestamp ('time_edited'): a boolean
        input is kept as-is with no timestamp, any other non-null value is
        treated as the edit time in epoch seconds.

        If the line is not valid JSON, the error and the line are printed
        and the returned row is all None except the last ('error') column,
        which holds "json.decoder.JSONDecodeError|<error>|<line>".
    """
    try:
        comment = json.loads(line)
    except json.decoder.JSONDecodeError as e:
        print(e)
        print(line)
        row = [None] * len(COMMENT_FIELDS)
        row[-1] = f"json.decoder.JSONDecodeError|{e}|{line}"
        return tuple(row)

    row = []
    for name in COMMENT_FIELDS:
        if name == 'time_edited':
            # Already appended together with 'edited'.
            continue
        if name == 'created_utc':
            val = comment.get(name)
            # NOTE(review): tz=None produces a naive datetime in the local
            # timezone of the machine running this, so the stored value
            # depends on where the job runs -- confirm this is intended.
            row.append(None if val is None
                       else datetime.fromtimestamp(int(val), tz=None))
        elif name == 'edited':
            val = comment.get(name)
            if val is None or isinstance(val, bool):
                # Missing/null or a plain flag: there is no edit time.
                row.extend((val, None))
            else:
                # Anything else is the edit time in epoch seconds.
                row.extend((True, datetime.fromtimestamp(int(val), tz=None)))
        else:
            row.append(comment.get(name))
    return tuple(row)
# --- submissions ----------------------------------------------------------
# Arrow schema for parsed submission rows.  Every column is nullable.  Field
# order matters: _parse_dump uses schema.names as the column labels for the
# positional row tuples returned by the parser.
SUBMISSION_SCHEMA = pa.schema([
    pa.field(column, dtype, nullable=True)
    for column, dtype in (
        ('id', pa.string()),
        ('author', pa.string()),
        ('subreddit', pa.string()),
        ('title', pa.string()),
        ('created_utc', pa.timestamp('ms')),
        ('permalink', pa.string()),
        ('url', pa.string()),
        ('domain', pa.string()),
        ('score', pa.int64()),
        ('ups', pa.int64()),
        ('downs', pa.int64()),
        ('over_18', pa.bool_()),
        ('has_media', pa.bool_()),
        ('selftext', pa.string()),
        ('retrieved_on', pa.timestamp('ms')),
        ('num_comments', pa.int64()),
        ('gilded', pa.int64()),
        ('edited', pa.bool_()),
        ('time_edited', pa.timestamp('ms')),
        ('subreddit_type', pa.string()),
        ('subreddit_id', pa.string()),
        ('subreddit_subscribers', pa.int64()),
        ('name', pa.string()),
        ('is_self', pa.bool_()),
        ('stickied', pa.bool_()),
        ('quarantine', pa.bool_()),
        ('error', pa.string()),
    )
])
# Output columns of parse_submission, in row order (one name per column of
# SUBMISSION_SCHEMA).
SUBMISSION_FIELDS = [
    'id',
    'author',
    'subreddit',
    'title',
    'created_utc',
    'permalink',
    'url',
    'domain',
    'score',
    'ups',
    'downs',
    'over_18',
    'has_media',
    'selftext',
    'retrieved_on',
    'num_comments',
    'gilded',
    'edited',
    'time_edited',
    'subreddit_type',
    'subreddit_id',
    'subreddit_subscribers',
    'name',
    'is_self',
    'stickied',
    'quarantine',
    'error',
]
# A single parser instance is shared by every parse_submission call.
_simdjson_parser = simdjson.Parser()


def parse_submission(line):
    """Parse one line of a submission dump into a row tuple.

    Args:
        line: one JSON-encoded submission.

    Returns:
        A tuple with one value per entry of SUBMISSION_FIELDS, in that
        order.  Keys absent from the JSON object become None.  'edited' is
        split into a boolean ('edited') and a timestamp ('time_edited'): a
        boolean input is kept as-is with no timestamp, any other non-null
        value is treated as the edit time in epoch seconds.  'has_media' is
        True when the 'media' key is present and not null.

        If the parser raises ValueError, the returned row is all None except
        the last ('error') column, which holds
        "Error parsing json|<error>|<line>".
    """
    try:
        post = _simdjson_parser.parse(line)
    except ValueError as e:
        row = [None] * len(SUBMISSION_FIELDS)
        row[-1] = f"Error parsing json|{e}|{line}"
        return tuple(row)

    row = []
    for name in SUBMISSION_FIELDS:
        if name == 'time_edited':
            # Already appended together with 'edited'.
            continue
        if name in ('created_utc', 'retrieved_on'):
            val = post.get(name, None)
            # NOTE(review): tz=None produces a naive datetime in the local
            # timezone of the machine running this -- confirm intended.
            row.append(None if val is None
                       else datetime.fromtimestamp(int(val), tz=None))
        elif name == 'edited':
            # Tolerate a missing/null 'edited' like every other field,
            # instead of aborting the whole dump on one record.
            val = post.get(name, None)
            if val is None or isinstance(val, bool):
                row.extend((val, None))
            else:
                # Anything else is the edit time in epoch seconds.
                row.extend((True, datetime.fromtimestamp(int(val), tz=None)))
        elif name == 'has_media':
            row.append(post.get('media', None) is not None)
        elif name not in post:
            row.append(None)
        else:
            row.append(post[name])
    return tuple(row)
# --- type registry --------------------------------------------------------
# Per-dump-type configuration, keyed by the CLI subcommand name.
#   schema       - Arrow schema of the parquet output
#   parser       - function turning one dump line into a row tuple
#   dumpdir      - default directory holding the compressed dumps
#   outdir       - default directory for the per-source parquet files
#   file_pattern - pattern handed to find_dumps as base_pattern
#   task_list    - default path of the file written by gen_task_list
#   fire_path    - subcommand name written into each task-list line
TYPES = {
    'comments': dict(
        schema=COMMENT_SCHEMA,
        parser=parse_comment,
        dumpdir="/gscratch/comdata/raw_data/reddit_dumps/comments",
        outdir="/gscratch/comdata/output/temp/reddit_comments.parquet",
        file_pattern='RC_20*.*',
        task_list='parse_comments_task_list',
        fire_path='comments',
    ),
    'submissions': dict(
        schema=SUBMISSION_SCHEMA,
        parser=parse_submission,
        dumpdir="/gscratch/comdata/raw_data/reddit_dumps/submissions",
        outdir="/gscratch/comdata/output/temp/reddit_submissions.parquet",
        file_pattern='RS_20*.*',
        task_list='parse_submissions_task_list',
        fire_path='submissions',
    ),
}
# --- shared workers -------------------------------------------------------
def _parse_dump(dump_type, partition, dumpdir=None, outdir=None):
    """Convert one compressed dump file into one parquet file.

    Args:
        dump_type: key into TYPES ('comments' or 'submissions').
        partition: file name of the dump inside ``dumpdir``; the output is
            named after it with the last extension replaced by ".parquet".
        dumpdir: input directory; defaults to the type's configured dumpdir.
        outdir: output directory (created if needed); defaults to the type's
            configured outdir.

    The dump is streamed through the type's parser and written in batches,
    so the whole file is never held in memory at once.  Output goes to a
    temporary file first and is renamed into place only after every batch
    has been written; if anything fails the temporary file is removed and
    the exception propagates.  Writing straight to the final path would
    leave a partial file there on failure, which stage 2 could pick up when
    it re-reads the directory.
    """
    config = TYPES[dump_type]
    dumpdir = dumpdir or config['dumpdir']
    outdir = outdir or config['outdir']
    schema = config['schema']
    parser = config['parser']
    rows_per_batch = 10000

    stream = open_fileset([os.path.join(dumpdir, partition)])
    rows = map(parser, stream)

    os.makedirs(outdir, exist_ok=True)
    outfile = os.path.join(outdir, os.path.splitext(partition)[0] + ".parquet")
    out_parent, out_name = os.path.split(outfile)
    # Same directory as the final file so the rename stays on one
    # filesystem.  The leading dot is meant to keep Spark from reading a
    # leftover temp file -- presumably it skips dot-files; TODO confirm.
    tmpfile = os.path.join(out_parent, f".{out_name}.tmp")

    try:
        with pq.ParquetWriter(tmpfile, schema=schema, compression='snappy', flavor='spark') as writer:
            while True:
                chunk = list(islice(rows, rows_per_batch))
                if not chunk:
                    break
                pddf = pd.DataFrame(chunk, columns=schema.names)
                table = pa.Table.from_pandas(pddf, schema=schema)
                writer.write_table(table)
    except BaseException:
        # Do not leave a half-written file behind, then re-raise.
        try:
            os.remove(tmpfile)
        except FileNotFoundError:
            pass
        raise
    os.replace(tmpfile, outfile)
def _gen_task_list(dump_type, dumpdir=None, tasklist=None):
    """Write a task list with one ``parse_dump`` command per dump file.

    Args:
        dump_type: key into TYPES ('comments' or 'submissions').
        dumpdir: directory searched for dumps; defaults to the type's
            configured dumpdir.
        tasklist: path of the file to write (overwritten); defaults to the
            type's configured task_list name.

    Each line has the form
    ``python3 parquet_part1.py <type> parse_dump <file name>``.  When
    ``dumpdir`` is overridden, ``--dumpdir=<dir>`` is appended to every
    command; otherwise parse_dump would look for the files in the default
    directory rather than the one they were found in.
    """
    from shlex import quote  # local import: only needed for the override

    config = TYPES[dump_type]
    # Only propagate the directory when the caller actually overrode it, so
    # the default task list stays exactly as before.
    dumpdir_flag = f' --dumpdir={quote(str(dumpdir))}' if dumpdir else ''
    dumpdir = dumpdir or config['dumpdir']
    tasklist = tasklist or config['task_list']
    fire_path = config['fire_path']

    # Collect the dumps before opening the task list, so a failed search
    # does not truncate an existing file.
    files = list(find_dumps(dumpdir, base_pattern=config['file_pattern']))
    with open(tasklist, 'w') as of:
        for fpath in files:
            partition = os.path.split(fpath)[1]
            of.write(f'python3 parquet_part1.py {fire_path} parse_dump {partition}{dumpdir_flag}\n')
# --- fire CLI -------------------------------------------------------------
class _Subcommand:
    """Command group for one dump type, exposed through the fire CLI."""

    def __init__(self, dump_type):
        # Key into TYPES that both commands pass to the shared workers.
        self._dump_type = dump_type

    def parse_dump(self, partition, dumpdir=None, outdir=None):
        """Parse the dump file ``partition`` into a parquet file."""
        overrides = {'dumpdir': dumpdir, 'outdir': outdir}
        _parse_dump(self._dump_type, partition, **overrides)

    def gen_task_list(self, dumpdir=None, tasklist=None):
        """Write the list of parse_dump commands for this dump type."""
        overrides = {'dumpdir': dumpdir, 'tasklist': tasklist}
        _gen_task_list(self._dump_type, **overrides)
if __name__ == "__main__":
    # One command group per dump type:
    #   parquet_part1.py <type> parse_dump <file>
    #   parquet_part1.py <type> gen_task_list
    commands = {
        dump_type: _Subcommand(dump_type)
        for dump_type in ('comments', 'submissions')
    }
    fire.Fire(commands)