enable --resuming from interrupted jobs.

This commit is contained in:
Nathan TeBlunthuis
2025-11-30 20:36:31 -08:00
parent 95b33123e3
commit 3c26185739
2 changed files with 265 additions and 4 deletions

View File

@@ -439,3 +439,100 @@ def test_parquet():
pytest.fail(exc)
# assert_frame_equal(test, baseline, check_like=True, check_dtype=False)
def test_resume():
"""Test that --resume properly resumes processing from the last written revid."""
import pyarrow.parquet as pq
# First, create a complete baseline output
tester_full = WikiqTester(SAILORMOON, "resume_full", in_compression="7z", out_format="parquet")
try:
tester_full.call_wikiq("--fandom-2020")
except subprocess.CalledProcessError as exc:
pytest.fail(exc.stderr.decode("utf8"))
# Read the full output
full_output_path = os.path.join(tester_full.output, f"{SAILORMOON}.parquet")
full_table = pq.read_table(full_output_path)
# Get the middle revid to use as the resume point
middle_idx = len(full_table) // 2
resume_revid = full_table.column("revid")[middle_idx].as_py()
print(f"Total revisions: {len(full_table)}, Resume point: {middle_idx}, Resume revid: {resume_revid}")
# Create a partial output by copying row groups to preserve the exact schema
tester_partial = WikiqTester(SAILORMOON, "resume_partial", in_compression="7z", out_format="parquet")
partial_output_path = os.path.join(tester_partial.output, f"{SAILORMOON}.parquet")
# Create partial output by filtering the table and writing with the same schema
partial_table = full_table.slice(0, middle_idx + 1)
pq.write_table(partial_table, partial_output_path)
# Now resume from the partial output
try:
tester_partial.call_wikiq("--fandom-2020", "--resume")
except subprocess.CalledProcessError as exc:
pytest.fail(exc.stderr.decode("utf8"))
# Read the resumed output
resumed_table = pq.read_table(partial_output_path)
# The resumed output should match the full output
# Convert to dataframes for comparison, sorting by revid
resumed_df = resumed_table.to_pandas().sort_values("revid").reset_index(drop=True)
full_df = full_table.to_pandas().sort_values("revid").reset_index(drop=True)
# Compare the dataframes
assert_frame_equal(resumed_df, full_df, check_like=True, check_dtype=False)
print(f"Resume test passed! Original: {len(full_df)} rows, Resumed: {len(resumed_df)} rows")
def test_resume_with_diff():
"""Test that --resume works correctly with diff computation."""
import pyarrow.parquet as pq
# First, create a complete baseline output with diff
tester_full = WikiqTester(SAILORMOON, "resume_diff_full", in_compression="7z", out_format="parquet")
try:
tester_full.call_wikiq("--diff", "--fandom-2020")
except subprocess.CalledProcessError as exc:
pytest.fail(exc.stderr.decode("utf8"))
# Read the full output
full_output_path = os.path.join(tester_full.output, f"{SAILORMOON}.parquet")
full_table = pq.read_table(full_output_path)
# Get a revid about 1/3 through to use as the resume point
resume_idx = len(full_table) // 3
resume_revid = full_table.column("revid")[resume_idx].as_py()
print(f"Total revisions: {len(full_table)}, Resume point: {resume_idx}, Resume revid: {resume_revid}")
# Create a partial output by filtering the table to preserve the exact schema
tester_partial = WikiqTester(SAILORMOON, "resume_diff_partial", in_compression="7z", out_format="parquet")
partial_output_path = os.path.join(tester_partial.output, f"{SAILORMOON}.parquet")
# Create partial output by slicing the table
partial_table = full_table.slice(0, resume_idx + 1)
pq.write_table(partial_table, partial_output_path)
# Now resume from the partial output
try:
tester_partial.call_wikiq("--diff", "--fandom-2020", "--resume")
except subprocess.CalledProcessError as exc:
pytest.fail(exc.stderr.decode("utf8"))
# Read the resumed output
resumed_table = pq.read_table(partial_output_path)
# Convert to dataframes for comparison, sorting by revid
resumed_df = resumed_table.to_pandas().sort_values("revid").reset_index(drop=True)
full_df = full_table.to_pandas().sort_values("revid").reset_index(drop=True)
# Compare the dataframes
assert_frame_equal(resumed_df, full_df, check_like=True, check_dtype=False)
print(f"Resume with diff test passed! Original: {len(full_df)} rows, Resumed: {len(resumed_df)} rows")