Enable resuming interrupted jobs via --resume.
This commit is contained in:
@@ -439,3 +439,100 @@ def test_parquet():
|
||||
pytest.fail(exc)
|
||||
|
||||
# assert_frame_equal(test, baseline, check_like=True, check_dtype=False)
|
||||
|
||||
def test_resume():
    """Check that a --resume run on a truncated parquet output matches a full run.

    A complete run provides the baseline. The first half of its rows (up to
    and including the middle row) is written to a second tester's output
    path, wikiq is invoked there with ``--resume``, and the resulting file is
    compared against the baseline after sorting both by ``revid``.
    """
    import pyarrow.parquet as pq

    def _sorted_frame(table):
        # Row order is normalised on revid so only the content is compared.
        return table.to_pandas().sort_values("revid").reset_index(drop=True)

    parquet_name = f"{SAILORMOON}.parquet"

    # Baseline: one uninterrupted run.
    baseline_tester = WikiqTester(
        SAILORMOON, "resume_full", in_compression="7z", out_format="parquet"
    )
    try:
        baseline_tester.call_wikiq("--fandom-2020")
    except subprocess.CalledProcessError as err:
        pytest.fail(err.stderr.decode("utf8"))

    baseline_table = pq.read_table(os.path.join(baseline_tester.output, parquet_name))

    # The middle row is the last one kept in the simulated interrupted output.
    n_rows = len(baseline_table)
    cut_idx = n_rows // 2
    cut_revid = baseline_table.column("revid")[cut_idx].as_py()
    print(f"Total revisions: {n_rows}, Resume point: {cut_idx}, Resume revid: {cut_revid}")

    # Simulate an interrupted job: slice the baseline table (which keeps its
    # schema) and write rows 0..cut_idx inclusive to the second tester's path.
    resumed_tester = WikiqTester(
        SAILORMOON, "resume_partial", in_compression="7z", out_format="parquet"
    )
    resumed_path = os.path.join(resumed_tester.output, parquet_name)
    pq.write_table(baseline_table.slice(0, cut_idx + 1), resumed_path)

    # Resume on top of the truncated file.
    try:
        resumed_tester.call_wikiq("--fandom-2020", "--resume")
    except subprocess.CalledProcessError as err:
        pytest.fail(err.stderr.decode("utf8"))

    # The resumed file should hold the same rows as the baseline.
    actual = _sorted_frame(pq.read_table(resumed_path))
    expected = _sorted_frame(baseline_table)
    assert_frame_equal(actual, expected, check_like=True, check_dtype=False)

    print(f"Resume test passed! Original: {len(expected)} rows, Resumed: {len(actual)} rows")
|
||||
|
||||
def test_resume_with_diff():
    """Check that --resume combined with --diff matches a full --diff run.

    A complete ``--diff`` run provides the baseline. The first third of its
    rows (up to and including the row at one third of the table) is written
    to a second tester's output path, wikiq is invoked there with ``--diff``
    and ``--resume``, and the resulting file is compared against the baseline
    after sorting both by ``revid``.
    """
    import pyarrow.parquet as pq

    def _sorted_frame(table):
        # Row order is normalised on revid so only the content is compared.
        return table.to_pandas().sort_values("revid").reset_index(drop=True)

    parquet_name = f"{SAILORMOON}.parquet"

    # Baseline: one uninterrupted run with diff enabled.
    baseline_tester = WikiqTester(
        SAILORMOON, "resume_diff_full", in_compression="7z", out_format="parquet"
    )
    try:
        baseline_tester.call_wikiq("--diff", "--fandom-2020")
    except subprocess.CalledProcessError as err:
        pytest.fail(err.stderr.decode("utf8"))

    baseline_table = pq.read_table(os.path.join(baseline_tester.output, parquet_name))

    # The row one third of the way in is the last one kept in the partial output.
    n_rows = len(baseline_table)
    cut_idx = n_rows // 3
    cut_revid = baseline_table.column("revid")[cut_idx].as_py()
    print(f"Total revisions: {n_rows}, Resume point: {cut_idx}, Resume revid: {cut_revid}")

    # Simulate an interrupted job: slice the baseline table (which keeps its
    # schema) and write rows 0..cut_idx inclusive to the second tester's path.
    resumed_tester = WikiqTester(
        SAILORMOON, "resume_diff_partial", in_compression="7z", out_format="parquet"
    )
    resumed_path = os.path.join(resumed_tester.output, parquet_name)
    pq.write_table(baseline_table.slice(0, cut_idx + 1), resumed_path)

    # Resume on top of the truncated file, still with diff enabled.
    try:
        resumed_tester.call_wikiq("--diff", "--fandom-2020", "--resume")
    except subprocess.CalledProcessError as err:
        pytest.fail(err.stderr.decode("utf8"))

    # The resumed file should hold the same rows as the baseline.
    actual = _sorted_frame(pq.read_table(resumed_path))
    expected = _sorted_frame(baseline_table)
    assert_frame_equal(actual, expected, check_like=True, check_dtype=False)

    print(f"Resume with diff test passed! Original: {len(expected)} rows, Resumed: {len(actual)} rows")
|
||||
|
||||
Reference in New Issue
Block a user