Output parquet files in chunks to avoid memory issues when writing large parquet outputs.

This commit is contained in:
Nathan TeBlunthuis
2025-12-20 21:45:39 -08:00
parent 6a4bf81e1a
commit 6988a281dc
3 changed files with 254 additions and 36 deletions

View File

@@ -125,12 +125,14 @@ def read_checkpoint(output_file, partition_namespaces=False):
Read resume point from checkpoint file if it exists.
Checkpoint format:
Single file: {"pageid": 54, "revid": 325}
Partitioned: {"0": {"pageid": 54, "revid": 325}, "1": {"pageid": 123, "revid": 456}}
Single file: {"pageid": 54, "revid": 325, "part": 2}
Partitioned: {"0": {"pageid": 54, "revid": 325, "part": 1}, ...}
Returns:
For single files: A tuple (pageid, revid), or None if not found.
For partitioned: A dict mapping namespace -> (pageid, revid), or None.
For single files: A tuple (pageid, revid, part), or None if not found.
For partitioned: A dict mapping namespace -> (pageid, revid, part), or None.
Note: part defaults to 0 for checkpoints without part numbers (backwards compat).
"""
checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
if not os.path.exists(checkpoint_path):
@@ -143,14 +145,16 @@ def read_checkpoint(output_file, partition_namespaces=False):
if not data:
return None
# Single-file format: {"pageid": ..., "revid": ...}
# Single-file format: {"pageid": ..., "revid": ..., "part": ...}
if "pageid" in data and "revid" in data:
return (data["pageid"], data["revid"])
part = data.get("part", 0)
return (data["pageid"], data["revid"], part)
# Partitioned format: {"0": {"pageid": ..., "revid": ...}, ...}
# Partitioned format: {"0": {"pageid": ..., "revid": ..., "part": ...}, ...}
result = {}
for key, value in data.items():
result[int(key)] = (value["pageid"], value["revid"])
part = value.get("part", 0)
result[int(key)] = (value["pageid"], value["revid"], part)
return result if result else None
@@ -173,10 +177,9 @@ def get_resume_point(output_file, partition_namespaces=False):
partition_namespaces: Whether the output uses namespace partitioning.
Returns:
For single files: A tuple (pageid, revid) for the row with the highest pageid,
or None if not found.
For partitioned: A dict mapping namespace -> (pageid, revid) for each partition,
or None if no partitions exist.
For single files: A tuple (pageid, revid, part) or None if not found.
For partitioned: A dict mapping namespace -> (pageid, revid, part), or None.
When falling back to parquet scanning, part defaults to 0.
"""
# First try checkpoint file (fast)
checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
@@ -202,6 +205,7 @@ def _get_last_row_resume_point(pq_path):
Since data is written in page/revision order, the last row group contains
the highest pageid/revid, and the last row in that group is the resume point.
Returns (pageid, revid, part) with part=0 (scanning can't determine part).
"""
pf = pq.ParquetFile(pq_path)
if pf.metadata.num_row_groups == 0:
@@ -214,15 +218,15 @@ def _get_last_row_resume_point(pq_path):
max_pageid = table['articleid'][-1].as_py()
max_revid = table['revid'][-1].as_py()
return (max_pageid, max_revid)
return (max_pageid, max_revid, 0)
def _get_resume_point_partitioned(output_file):
"""Find per-namespace resume points from partitioned output.
Only looks for the specific output file in each namespace directory.
Returns a dict mapping namespace -> (max_pageid, max_revid) for each partition
where the output file exists.
Returns a dict mapping namespace -> (max_pageid, max_revid, part=0) for each
partition where the output file exists.
Args:
output_file: Path like 'dir/output.parquet' where namespace=* subdirectories