output parquet files in chunks to avoid memory issues with parquet.

This commit is contained in:
Nathan TeBlunthuis
2025-12-20 21:45:39 -08:00
parent 6a4bf81e1a
commit 6988a281dc
3 changed files with 254 additions and 36 deletions

View File

@@ -369,7 +369,7 @@ def test_cleanup_interrupted_resume_original_corrupted_temp_valid():
resume_point = get_resume_point(output_file, partition_namespaces=False)
assert resume_point is not None, "Should find resume point from recovered file"
assert resume_point == (30, 300), f"Expected (30, 300), got {resume_point}"
assert resume_point == (30, 300, 0), f"Expected (30, 300, 0), got {resume_point}"
print("Cleanup with original corrupted, temp valid test passed!")
@@ -396,7 +396,7 @@ def test_cleanup_original_missing_temp_valid_no_checkpoint():
resume_point = get_resume_point(output_file, partition_namespaces=False)
assert resume_point is not None, "Should find resume point from recovered file"
assert resume_point == (30, 300), f"Expected (30, 300), got {resume_point}"
assert resume_point == (30, 300, 0), f"Expected (30, 300, 0), got {resume_point}"
print("Original missing, temp valid, no checkpoint test passed!")
@@ -464,3 +464,86 @@ def test_concurrent_jobs_different_input_files():
assert orig2_ns1.num_rows == 2, "file2 ns1 should still have 2 rows"
print("Concurrent jobs with different input files test passed!")
def test_max_revisions_per_file_creates_parts():
    """Test that --max-revisions-per-file creates multiple part files.

    Runs wikiq with a small --max-revisions-per-file limit so output is
    forced to rotate into several ``.partN.parquet`` files, then verifies:
      * more than one part file was produced,
      * the parts collectively contain some rows, and
      * every part except the last holds at least the per-file limit
        (rotation happens after the batch that hits the limit is written).
    """
    import re

    tester = WikiqTester(SAILORMOON, "max_revs_parts", in_compression="7z", out_format="parquet")
    max_revs = 50
    try:
        # Use a very small limit to force multiple parts
        tester.call_wikiq("--fandom-2020", "--max-revisions-per-file", str(max_revs))
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    output_dir = tester.output
    all_parquet = [f for f in os.listdir(output_dir) if f.endswith(".parquet") and ".part" in f]

    def get_part_num(filename):
        # Extract the numeric part index so sorting is numeric, not lexicographic.
        match = re.search(r'\.part(\d+)\.parquet$', filename)
        return int(match.group(1)) if match else 0

    part_files = sorted(all_parquet, key=get_part_num)
    assert len(part_files) > 1, f"Expected multiple part files, got {part_files}"

    # Read each part's row count once from parquet footer metadata instead of
    # materializing every table twice (once for the total, once per-part).
    row_counts = [pq.ParquetFile(os.path.join(output_dir, f)).metadata.num_rows
                  for f in part_files]
    total_rows = sum(row_counts)
    assert total_rows > 0, "Should have some rows across all parts"

    # Each part (except the last) should have at least max_revisions rows
    # (rotation happens after the batch that hits the limit is written)
    for f, num_rows in zip(part_files[:-1], row_counts):
        assert num_rows >= max_revs, f"Part file {f} should have at least {max_revs} rows, got {num_rows}"

    print(f"max-revisions-per-file test passed! Created {len(part_files)} parts with {total_rows} total rows")
def test_max_revisions_per_file_with_partitioned():
    """Test that --max-revisions-per-file works with partitioned namespace output.

    Runs wikiq with ``--partition-namespaces`` and a small
    --max-revisions-per-file limit, then verifies that:
      * ``namespace=*`` directories exist,
      * at least one namespace rotated into multiple ``.partN.parquet`` files,
      * within such a namespace every part except the last holds at least the
        per-file limit (rotation happens after the limit-hitting batch).
    """
    import re

    tester = WikiqTester(SAILORMOON, "max_revs_partitioned", in_compression="7z", out_format="parquet")
    max_revs = 20
    try:
        # Use a small limit to force parts, with partitioned output
        tester.call_wikiq("--fandom-2020", "--partition-namespaces", "--max-revisions-per-file", str(max_revs))
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    output_dir = tester.output
    # Find namespace directories produced by hive-style partitioning.
    ns_dirs = [d for d in os.listdir(output_dir) if d.startswith("namespace=")]
    assert len(ns_dirs) > 0, "Should have namespace directories"

    def get_part_num(filename):
        # Extract the numeric part index so sorting is numeric, not lexicographic.
        match = re.search(r'\.part(\d+)\.parquet$', filename)
        return int(match.group(1)) if match else 0

    # Check that at least one namespace has multiple parts
    found_multi_part = False
    for ns_dir in ns_dirs:
        ns_path = os.path.join(output_dir, ns_dir)
        parquet_files = [f for f in os.listdir(ns_path) if f.endswith(".parquet")]
        part_files = [f for f in parquet_files if ".part" in f]
        if len(part_files) > 1:
            found_multi_part = True
            # Sort by part number and verify each part (except last) has at least limit rows
            sorted_parts = sorted(part_files, key=get_part_num)
            for f in sorted_parts[:-1]:
                # Row count comes from the parquet footer; no data is loaded.
                pf = pq.ParquetFile(os.path.join(ns_path, f))
                num_rows = pf.metadata.num_rows
                assert num_rows >= max_revs, f"Part file {f} in {ns_dir} should have at least {max_revs} rows, got {num_rows}"

    assert found_multi_part, "At least one namespace should have multiple part files"
    # Plain string: the original used an f-string with no placeholders (ruff F541).
    print("max-revisions-per-file with partitioned output test passed!")