output parquet files in chunks to avoid memory issues with parquet.
This commit is contained in:
@@ -369,7 +369,7 @@ def test_cleanup_interrupted_resume_original_corrupted_temp_valid():
|
||||
|
||||
resume_point = get_resume_point(output_file, partition_namespaces=False)
|
||||
assert resume_point is not None, "Should find resume point from recovered file"
|
||||
assert resume_point == (30, 300), f"Expected (30, 300), got {resume_point}"
|
||||
assert resume_point == (30, 300, 0), f"Expected (30, 300, 0), got {resume_point}"
|
||||
|
||||
print("Cleanup with original corrupted, temp valid test passed!")
|
||||
|
||||
@@ -396,7 +396,7 @@ def test_cleanup_original_missing_temp_valid_no_checkpoint():
|
||||
|
||||
resume_point = get_resume_point(output_file, partition_namespaces=False)
|
||||
assert resume_point is not None, "Should find resume point from recovered file"
|
||||
assert resume_point == (30, 300), f"Expected (30, 300), got {resume_point}"
|
||||
assert resume_point == (30, 300, 0), f"Expected (30, 300, 0), got {resume_point}"
|
||||
|
||||
print("Original missing, temp valid, no checkpoint test passed!")
|
||||
|
||||
@@ -464,3 +464,86 @@ def test_concurrent_jobs_different_input_files():
|
||||
assert orig2_ns1.num_rows == 2, "file2 ns1 should still have 2 rows"
|
||||
|
||||
print("Concurrent jobs with different input files test passed!")
|
||||
|
||||
|
||||
def test_max_revisions_per_file_creates_parts():
    """Test that --max-revisions-per-file creates multiple part files."""
    import re

    tester = WikiqTester(SAILORMOON, "max_revs_parts", in_compression="7z", out_format="parquet")

    max_revs = 50
    try:
        # Use a very small limit to force multiple parts
        tester.call_wikiq("--fandom-2020", "--max-revisions-per-file", str(max_revs))
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    out_dir = tester.output
    candidates = [
        name
        for name in os.listdir(out_dir)
        if name.endswith(".parquet") and ".part" in name
    ]

    # Sort by part number numerically rather than lexicographically
    part_pattern = re.compile(r'\.part(\d+)\.parquet$')

    def get_part_num(filename):
        m = part_pattern.search(filename)
        return int(m.group(1)) if m else 0

    part_files = sorted(candidates, key=get_part_num)

    assert len(part_files) > 1, f"Expected multiple part files, got {part_files}"

    # Read every part and tally the combined row count
    total_rows = sum(
        len(pq.read_table(os.path.join(out_dir, name))) for name in part_files
    )

    assert total_rows > 0, "Should have some rows across all parts"

    # Each part (except the last) should have at least max_revisions rows
    # (rotation happens after the batch that hits the limit is written)
    for f in part_files[:-1]:
        table = pq.read_table(os.path.join(out_dir, f))
        assert len(table) >= max_revs, f"Part file {f} should have at least {max_revs} rows, got {len(table)}"

    print(f"max-revisions-per-file test passed! Created {len(part_files)} parts with {total_rows} total rows")
|
||||
|
||||
|
||||
def test_max_revisions_per_file_with_partitioned():
    """Test that --max-revisions-per-file works with partitioned namespace output."""
    import re

    tester = WikiqTester(SAILORMOON, "max_revs_partitioned", in_compression="7z", out_format="parquet")

    max_revs = 20
    try:
        # Use a small limit to force parts, with partitioned output
        tester.call_wikiq("--fandom-2020", "--partition-namespaces", "--max-revisions-per-file", str(max_revs))
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    out_root = tester.output

    # Hive-style partitioning: one "namespace=<id>" directory per namespace
    ns_dirs = [d for d in os.listdir(out_root) if d.startswith("namespace=")]
    assert len(ns_dirs) > 0, "Should have namespace directories"

    part_pattern = re.compile(r'\.part(\d+)\.parquet$')

    def get_part_num(filename):
        m = part_pattern.search(filename)
        return int(m.group(1)) if m else 0

    # Check that at least one namespace has multiple parts
    found_multi_part = False
    for ns_dir in ns_dirs:
        ns_path = os.path.join(out_root, ns_dir)
        parquet_files = [name for name in os.listdir(ns_path) if name.endswith(".parquet")]
        part_files = [name for name in parquet_files if ".part" in name]
        if len(part_files) <= 1:
            continue
        found_multi_part = True
        # Every part but the last should have reached the rotation limit;
        # use parquet metadata so we don't load full tables just for a count.
        for f in sorted(part_files, key=get_part_num)[:-1]:
            pf = pq.ParquetFile(os.path.join(ns_path, f))
            num_rows = pf.metadata.num_rows
            assert num_rows >= max_revs, f"Part file {f} in {ns_dir} should have at least {max_revs} rows, got {num_rows}"

    assert found_multi_part, "At least one namespace should have multiple part files"

    print(f"max-revisions-per-file with partitioned output test passed!")
|
||||
|
||||
Reference in New Issue
Block a user