Output parquet files in chunks to avoid memory issues when writing large parquet outputs.

This commit is contained in:
Nathan TeBlunthuis
2025-12-20 21:45:39 -08:00
parent 6a4bf81e1a
commit 6988a281dc
3 changed files with 254 additions and 36 deletions

View File

@@ -125,12 +125,14 @@ def read_checkpoint(output_file, partition_namespaces=False):
Read resume point from checkpoint file if it exists.
Checkpoint format:
Single file: {"pageid": 54, "revid": 325}
Partitioned: {"0": {"pageid": 54, "revid": 325}, "1": {"pageid": 123, "revid": 456}}
Single file: {"pageid": 54, "revid": 325, "part": 2}
Partitioned: {"0": {"pageid": 54, "revid": 325, "part": 1}, ...}
Returns:
For single files: A tuple (pageid, revid), or None if not found.
For partitioned: A dict mapping namespace -> (pageid, revid), or None.
For single files: A tuple (pageid, revid, part), or None if not found.
For partitioned: A dict mapping namespace -> (pageid, revid, part), or None.
Note: part defaults to 0 for checkpoints without part numbers (backwards compat).
"""
checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
if not os.path.exists(checkpoint_path):
@@ -143,14 +145,16 @@ def read_checkpoint(output_file, partition_namespaces=False):
if not data:
return None
# Single-file format: {"pageid": ..., "revid": ...}
# Single-file format: {"pageid": ..., "revid": ..., "part": ...}
if "pageid" in data and "revid" in data:
return (data["pageid"], data["revid"])
part = data.get("part", 0)
return (data["pageid"], data["revid"], part)
# Partitioned format: {"0": {"pageid": ..., "revid": ...}, ...}
# Partitioned format: {"0": {"pageid": ..., "revid": ..., "part": ...}, ...}
result = {}
for key, value in data.items():
result[int(key)] = (value["pageid"], value["revid"])
part = value.get("part", 0)
result[int(key)] = (value["pageid"], value["revid"], part)
return result if result else None
@@ -173,10 +177,9 @@ def get_resume_point(output_file, partition_namespaces=False):
partition_namespaces: Whether the output uses namespace partitioning.
Returns:
For single files: A tuple (pageid, revid) for the row with the highest pageid,
or None if not found.
For partitioned: A dict mapping namespace -> (pageid, revid) for each partition,
or None if no partitions exist.
For single files: A tuple (pageid, revid, part) or None if not found.
For partitioned: A dict mapping namespace -> (pageid, revid, part), or None.
When falling back to parquet scanning, part defaults to 0.
"""
# First try checkpoint file (fast)
checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
@@ -202,6 +205,7 @@ def _get_last_row_resume_point(pq_path):
Since data is written in page/revision order, the last row group contains
the highest pageid/revid, and the last row in that group is the resume point.
Returns (pageid, revid, part) with part=0 (scanning can't determine part).
"""
pf = pq.ParquetFile(pq_path)
if pf.metadata.num_row_groups == 0:
@@ -214,15 +218,15 @@ def _get_last_row_resume_point(pq_path):
max_pageid = table['articleid'][-1].as_py()
max_revid = table['revid'][-1].as_py()
return (max_pageid, max_revid)
return (max_pageid, max_revid, 0)
def _get_resume_point_partitioned(output_file):
"""Find per-namespace resume points from partitioned output.
Only looks for the specific output file in each namespace directory.
Returns a dict mapping namespace -> (max_pageid, max_revid) for each partition
where the output file exists.
Returns a dict mapping namespace -> (max_pageid, max_revid, part=0) for each
partition where the output file exists.
Args:
output_file: Path like 'dir/output.parquet' where namespace=* subdirectories