Output Parquet files in chunks to avoid memory issues with Parquet.
This commit is contained in:
@@ -125,12 +125,14 @@ def read_checkpoint(output_file, partition_namespaces=False):
|
||||
Read resume point from checkpoint file if it exists.
|
||||
|
||||
Checkpoint format:
|
||||
Single file: {"pageid": 54, "revid": 325}
|
||||
Partitioned: {"0": {"pageid": 54, "revid": 325}, "1": {"pageid": 123, "revid": 456}}
|
||||
Single file: {"pageid": 54, "revid": 325, "part": 2}
|
||||
Partitioned: {"0": {"pageid": 54, "revid": 325, "part": 1}, ...}
|
||||
|
||||
Returns:
|
||||
For single files: A tuple (pageid, revid), or None if not found.
|
||||
For partitioned: A dict mapping namespace -> (pageid, revid), or None.
|
||||
For single files: A tuple (pageid, revid, part), or None if not found.
|
||||
For partitioned: A dict mapping namespace -> (pageid, revid, part), or None.
|
||||
|
||||
Note: part defaults to 0 for checkpoints without part numbers (backwards compat).
|
||||
"""
|
||||
checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
|
||||
if not os.path.exists(checkpoint_path):
|
||||
@@ -143,14 +145,16 @@ def read_checkpoint(output_file, partition_namespaces=False):
|
||||
if not data:
|
||||
return None
|
||||
|
||||
# Single-file format: {"pageid": ..., "revid": ...}
|
||||
# Single-file format: {"pageid": ..., "revid": ..., "part": ...}
|
||||
if "pageid" in data and "revid" in data:
|
||||
return (data["pageid"], data["revid"])
|
||||
part = data.get("part", 0)
|
||||
return (data["pageid"], data["revid"], part)
|
||||
|
||||
# Partitioned format: {"0": {"pageid": ..., "revid": ...}, ...}
|
||||
# Partitioned format: {"0": {"pageid": ..., "revid": ..., "part": ...}, ...}
|
||||
result = {}
|
||||
for key, value in data.items():
|
||||
result[int(key)] = (value["pageid"], value["revid"])
|
||||
part = value.get("part", 0)
|
||||
result[int(key)] = (value["pageid"], value["revid"], part)
|
||||
|
||||
return result if result else None
|
||||
|
||||
@@ -173,10 +177,9 @@ def get_resume_point(output_file, partition_namespaces=False):
|
||||
partition_namespaces: Whether the output uses namespace partitioning.
|
||||
|
||||
Returns:
|
||||
For single files: A tuple (pageid, revid) for the row with the highest pageid,
|
||||
or None if not found.
|
||||
For partitioned: A dict mapping namespace -> (pageid, revid) for each partition,
|
||||
or None if no partitions exist.
|
||||
For single files: A tuple (pageid, revid, part) or None if not found.
|
||||
For partitioned: A dict mapping namespace -> (pageid, revid, part), or None.
|
||||
When falling back to parquet scanning, part defaults to 0.
|
||||
"""
|
||||
# First try checkpoint file (fast)
|
||||
checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
|
||||
@@ -202,6 +205,7 @@ def _get_last_row_resume_point(pq_path):
|
||||
|
||||
Since data is written in page/revision order, the last row group contains
|
||||
the highest pageid/revid, and the last row in that group is the resume point.
|
||||
Returns (pageid, revid, part) with part=0 (scanning can't determine part).
|
||||
"""
|
||||
pf = pq.ParquetFile(pq_path)
|
||||
if pf.metadata.num_row_groups == 0:
|
||||
@@ -214,15 +218,15 @@ def _get_last_row_resume_point(pq_path):
|
||||
|
||||
max_pageid = table['articleid'][-1].as_py()
|
||||
max_revid = table['revid'][-1].as_py()
|
||||
return (max_pageid, max_revid)
|
||||
return (max_pageid, max_revid, 0)
|
||||
|
||||
|
||||
def _get_resume_point_partitioned(output_file):
|
||||
"""Find per-namespace resume points from partitioned output.
|
||||
|
||||
Only looks for the specific output file in each namespace directory.
|
||||
Returns a dict mapping namespace -> (max_pageid, max_revid) for each partition
|
||||
where the output file exists.
|
||||
Returns a dict mapping namespace -> (max_pageid, max_revid, part=0) for each
|
||||
partition where the output file exists.
|
||||
|
||||
Args:
|
||||
output_file: Path like 'dir/output.parquet' where namespace=* subdirectories
|
||||
|
||||
Reference in New Issue
Block a user