make --resume work with partitioned namespaces.
This commit is contained in:
@@ -7,6 +7,7 @@ from typing import Final, Union
|
||||
import pytest
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
from pandas import DataFrame
|
||||
from pandas.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
@@ -536,3 +537,96 @@ def test_resume_with_diff():
|
||||
assert_frame_equal(resumed_df, full_df, check_like=True, check_dtype=False)
|
||||
|
||||
print(f"Resume with diff test passed! Original: {len(full_df)} rows, Resumed: {len(resumed_df)} rows")
|
||||
|
||||
def _namespace_parquet_paths(output_dir, ns_dir):
    """Return the sorted paths of every ``.parquet`` file in one namespace directory."""
    ns_path = os.path.join(output_dir, ns_dir)
    # Sort so the processing order does not depend on os.listdir's arbitrary order.
    return [
        os.path.join(ns_path, name)
        for name in sorted(os.listdir(ns_path))
        if name.endswith('.parquet')
    ]


def _collect_partition_revids(output_dir, namespace_dirs):
    """Return the sorted, de-duplicated ``revid`` values found under *output_dir*.

    Every parquet file of every directory in *namespace_dirs* is read, so a
    namespace that holds more than one parquet file is still fully covered.
    """
    import pyarrow.parquet as pq

    revids = []
    for ns_dir in sorted(namespace_dirs):
        for parquet_path in _namespace_parquet_paths(output_dir, ns_dir):
            # Only the revid column is needed for the comparison.
            table = pq.ParquetFile(parquet_path).read(columns=['revid'])
            revids.extend(table.column('revid').to_pylist())
    return sorted(set(revids))


def test_resume_with_partition_namespaces():
    """Test that --resume works correctly with --partition-namespaces.

    Steps:
      1. Run wikiq once with --partition-namespaces to get a complete baseline.
      2. Copy the baseline's ``namespace=*`` directories into a second output
         location and truncate every parquet file to the rows whose revid is
         at or below a resume point roughly one third through the sorted revids.
      3. Run wikiq again on the truncated output with --resume.
      4. Assert that the set of revids after resuming equals the baseline's.
    """
    import pyarrow.parquet as pq

    # First, create a complete baseline output with partition-namespaces
    tester_full = WikiqTester(SAILORMOON, "resume_partition_full", in_compression="7z", out_format="parquet")

    try:
        tester_full.call_wikiq("--partition-namespaces", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    # The partitioned output is a directory containing one sub-directory per namespace
    full_output_dir = tester_full.output
    namespace_dirs = [d for d in os.listdir(full_output_dir) if d.startswith('namespace=')]

    if not namespace_dirs:
        pytest.fail("No namespace directories found in output")

    # Collect all revisions from all namespaces
    full_revids_sorted = _collect_partition_revids(full_output_dir, namespace_dirs)
    total_revisions = len(full_revids_sorted)

    # Without this guard an empty baseline would surface as a bare IndexError below.
    if not total_revisions:
        pytest.fail("No revisions found in partitioned output")

    # Get a revid about 1/3 through to use as the resume point
    resume_idx = total_revisions // 3
    resume_revid = full_revids_sorted[resume_idx]

    print(f"Total revisions: {total_revisions}, Resume point: {resume_idx}, Resume revid: {resume_revid}")

    # Create a partial output by manually creating the partitioned structure
    tester_partial = WikiqTester(SAILORMOON, "resume_partition_partial", in_compression="7z", out_format="parquet")
    partial_output_dir = tester_partial.output

    revised_data_count = 0
    for ns_dir in namespace_dirs:
        # Copy the full namespace partition, then truncate it in place
        shutil.copytree(
            os.path.join(full_output_dir, ns_dir),
            os.path.join(partial_output_dir, ns_dir),
        )

        for parquet_path in _namespace_parquet_paths(partial_output_dir, ns_dir):
            table = pq.ParquetFile(parquet_path).read()

            # Filter to only rows up to the resume point
            revids = table.column('revid').to_pylist()
            mask = pa.array([revid <= resume_revid for revid in revids], type=pa.bool_())
            partial_table = table.filter(mask)
            revised_data_count += len(partial_table)

            # Write back the filtered data
            pq.write_table(partial_table, parquet_path)

    print(f"Created partial output with {revised_data_count} revisions (up to revid {resume_revid})")

    # Now resume from the partial output
    try:
        tester_partial.call_wikiq("--partition-namespaces", "--fandom-2020", "--resume")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    # Read the resumed output and collect revids
    # NOTE(review): revids are de-duplicated before comparison, so rows emitted
    # twice by --resume would not be detected here -- confirm this is intended.
    resumed_revids_sorted = _collect_partition_revids(partial_output_dir, namespace_dirs)

    # Compare the revids
    assert resumed_revids_sorted == full_revids_sorted, f"Resumed revids mismatch: {len(resumed_revids_sorted)} vs {len(full_revids_sorted)}"

    print(f"Resume with partition-namespaces test passed! Original: {len(full_revids_sorted)} revisions, Resumed: {len(resumed_revids_sorted)} revisions")
|
||||
|
||||
Reference in New Issue
Block a user