output parquet files in chunks to avoid memory issues with parquet.

This commit is contained in:
Nathan TeBlunthuis
2025-12-20 21:45:39 -08:00
parent 6a4bf81e1a
commit 6988a281dc
3 changed files with 254 additions and 36 deletions

View File

@@ -369,7 +369,7 @@ def test_cleanup_interrupted_resume_original_corrupted_temp_valid():
resume_point = get_resume_point(output_file, partition_namespaces=False)
assert resume_point is not None, "Should find resume point from recovered file"
assert resume_point == (30, 300), f"Expected (30, 300), got {resume_point}"
assert resume_point == (30, 300, 0), f"Expected (30, 300, 0), got {resume_point}"
print("Cleanup with original corrupted, temp valid test passed!")
@@ -396,7 +396,7 @@ def test_cleanup_original_missing_temp_valid_no_checkpoint():
resume_point = get_resume_point(output_file, partition_namespaces=False)
assert resume_point is not None, "Should find resume point from recovered file"
assert resume_point == (30, 300), f"Expected (30, 300), got {resume_point}"
assert resume_point == (30, 300, 0), f"Expected (30, 300, 0), got {resume_point}"
print("Original missing, temp valid, no checkpoint test passed!")
@@ -464,3 +464,86 @@ def test_concurrent_jobs_different_input_files():
assert orig2_ns1.num_rows == 2, "file2 ns1 should still have 2 rows"
print("Concurrent jobs with different input files test passed!")
def test_max_revisions_per_file_creates_parts():
    """Test that --max-revisions-per-file creates multiple part files.

    Runs wikiq with a small --max-revisions-per-file limit so output is
    forced to rotate into several ``.partN.parquet`` files, then verifies:
      * more than one part file was produced,
      * the parts collectively contain some rows, and
      * every part except the last holds at least the per-file limit
        (rotation happens after the batch that hits the limit is written).
    """
    import re

    tester = WikiqTester(SAILORMOON, "max_revs_parts", in_compression="7z", out_format="parquet")
    max_revs = 50
    try:
        # Use a very small limit to force multiple parts
        tester.call_wikiq("--fandom-2020", "--max-revisions-per-file", str(max_revs))
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    output_dir = tester.output
    all_parquet = [f for f in os.listdir(output_dir) if f.endswith(".parquet") and ".part" in f]

    def get_part_num(filename):
        # Extract the numeric part index so sorting is numeric, not lexicographic.
        match = re.search(r'\.part(\d+)\.parquet$', filename)
        return int(match.group(1)) if match else 0

    part_files = sorted(all_parquet, key=get_part_num)
    assert len(part_files) > 1, f"Expected multiple part files, got {part_files}"

    # Read each part's row count once from parquet footer metadata instead of
    # materializing every table twice (once for the total, once per-part).
    row_counts = [pq.ParquetFile(os.path.join(output_dir, f)).metadata.num_rows
                  for f in part_files]
    total_rows = sum(row_counts)
    assert total_rows > 0, "Should have some rows across all parts"

    # Each part (except the last) should have at least max_revisions rows
    # (rotation happens after the batch that hits the limit is written)
    for f, num_rows in zip(part_files[:-1], row_counts):
        assert num_rows >= max_revs, f"Part file {f} should have at least {max_revs} rows, got {num_rows}"

    print(f"max-revisions-per-file test passed! Created {len(part_files)} parts with {total_rows} total rows")
def test_max_revisions_per_file_with_partitioned():
    """Test that --max-revisions-per-file works with partitioned namespace output.

    Runs wikiq with ``--partition-namespaces`` and a small
    --max-revisions-per-file limit, then verifies that:
      * ``namespace=*`` directories exist,
      * at least one namespace rotated into multiple ``.partN.parquet`` files,
      * within such a namespace every part except the last holds at least the
        per-file limit (rotation happens after the limit-hitting batch).
    """
    import re

    tester = WikiqTester(SAILORMOON, "max_revs_partitioned", in_compression="7z", out_format="parquet")
    max_revs = 20
    try:
        # Use a small limit to force parts, with partitioned output
        tester.call_wikiq("--fandom-2020", "--partition-namespaces", "--max-revisions-per-file", str(max_revs))
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    output_dir = tester.output
    # Find namespace directories produced by hive-style partitioning.
    ns_dirs = [d for d in os.listdir(output_dir) if d.startswith("namespace=")]
    assert len(ns_dirs) > 0, "Should have namespace directories"

    def get_part_num(filename):
        # Extract the numeric part index so sorting is numeric, not lexicographic.
        match = re.search(r'\.part(\d+)\.parquet$', filename)
        return int(match.group(1)) if match else 0

    # Check that at least one namespace has multiple parts
    found_multi_part = False
    for ns_dir in ns_dirs:
        ns_path = os.path.join(output_dir, ns_dir)
        parquet_files = [f for f in os.listdir(ns_path) if f.endswith(".parquet")]
        part_files = [f for f in parquet_files if ".part" in f]
        if len(part_files) > 1:
            found_multi_part = True
            # Sort by part number and verify each part (except last) has at least limit rows
            sorted_parts = sorted(part_files, key=get_part_num)
            for f in sorted_parts[:-1]:
                # Row count comes from the parquet footer; no data is loaded.
                pf = pq.ParquetFile(os.path.join(ns_path, f))
                num_rows = pf.metadata.num_rows
                assert num_rows >= max_revs, f"Part file {f} in {ns_dir} should have at least {max_revs} rows, got {num_rows}"

    assert found_multi_part, "At least one namespace should have multiple part files"
    # Plain string: the original used an f-string with no placeholders (ruff F541).
    print("max-revisions-per-file with partitioned output test passed!")