Enable --resume for resuming interrupted jobs.

2025-11-30 20:36:31 -08:00
parent 95b33123e3
commit 3c26185739
2 changed files with 265 additions and 4 deletions
--- a/test/Wikiq_Unit_Test.py
+++ b/test/Wikiq_Unit_Test.py
@@ -439,3 +439,100 @@ def test_parquet():
            pytest.fail(exc)

    # assert_frame_equal(test, baseline, check_like=True, check_dtype=False)
+
+def test_resume():
+    """Test that --resume properly resumes processing from the last written revid."""
+    import pyarrow.parquet as pq
+
+    # Baseline: run wikiq to completion; its parquet output is the expected result
+    tester_full = WikiqTester(SAILORMOON, "resume_full", in_compression="7z", out_format="parquet")
+
+    try:
+        tester_full.call_wikiq("--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    # Load the complete baseline output into memory
+    full_output_path = os.path.join(tester_full.output, f"{SAILORMOON}.parquet")
+    full_table = pq.read_table(full_output_path)
+
+    # Use the row halfway through the baseline as the simulated interruption point
+    middle_idx = len(full_table) // 2
+    resume_revid = full_table.column("revid")[middle_idx].as_py()
+
+    print(f"Total revisions: {len(full_table)}, Resume point: {middle_idx}, Resume revid: {resume_revid}")
+
+    # Second tester; its output path is pre-seeded below with a truncated copy of the baseline
+    tester_partial = WikiqTester(SAILORMOON, "resume_partial", in_compression="7z", out_format="parquet")
+    partial_output_path = os.path.join(tester_partial.output, f"{SAILORMOON}.parquet")
+
+    # Simulate an interrupted job: keep rows 0..middle_idx inclusive, so the last row written is resume_revid
+    partial_table = full_table.slice(0, middle_idx + 1)
+    pq.write_table(partial_table, partial_output_path)
+
+    # Re-run with --resume on top of the partial output; report wikiq's stderr if the run fails
+    try:
+        tester_partial.call_wikiq("--fandom-2020", "--resume")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    # Read back the same path, which is expected to hold the completed output after resuming
+    resumed_table = pq.read_table(partial_output_path)
+
+    # The resumed output should match the full output
+    # Sort both by revid so the comparison does not depend on row order
+    resumed_df = resumed_table.to_pandas().sort_values("revid").reset_index(drop=True)
+    full_df = full_table.to_pandas().sort_values("revid").reset_index(drop=True)
+
+    # check_like / check_dtype=False relax column-order and dtype checks (assuming pandas' assert_frame_equal)
+    assert_frame_equal(resumed_df, full_df, check_like=True, check_dtype=False)
+
+    print(f"Resume test passed! Original: {len(full_df)} rows, Resumed: {len(resumed_df)} rows")
+
+def test_resume_with_diff():
+    """Test that --resume works correctly with diff computation."""
+    import pyarrow.parquet as pq
+
+    # Baseline: run wikiq with --diff to completion; its parquet output is the expected result
+    tester_full = WikiqTester(SAILORMOON, "resume_diff_full", in_compression="7z", out_format="parquet")
+
+    try:
+        tester_full.call_wikiq("--diff", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    # Load the complete baseline output into memory
+    full_output_path = os.path.join(tester_full.output, f"{SAILORMOON}.parquet")
+    full_table = pq.read_table(full_output_path)
+
+    # Use the row about 1/3 of the way through the baseline as the simulated interruption point
+    resume_idx = len(full_table) // 3
+    resume_revid = full_table.column("revid")[resume_idx].as_py()
+
+    print(f"Total revisions: {len(full_table)}, Resume point: {resume_idx}, Resume revid: {resume_revid}")
+
+    # Second tester; its output path is pre-seeded below with a truncated copy of the baseline
+    tester_partial = WikiqTester(SAILORMOON, "resume_diff_partial", in_compression="7z", out_format="parquet")
+    partial_output_path = os.path.join(tester_partial.output, f"{SAILORMOON}.parquet")
+
+    # Simulate an interrupted job: keep rows 0..resume_idx inclusive, so the last row written is resume_revid
+    partial_table = full_table.slice(0, resume_idx + 1)
+    pq.write_table(partial_table, partial_output_path)
+
+    # Re-run with --resume (and the same --diff flag) on top of the partial output; report stderr on failure
+    try:
+        tester_partial.call_wikiq("--diff", "--fandom-2020", "--resume")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    # Read back the same path, which is expected to hold the completed output after resuming
+    resumed_table = pq.read_table(partial_output_path)
+
+    # Sort both by revid so the comparison does not depend on row order
+    resumed_df = resumed_table.to_pandas().sort_values("revid").reset_index(drop=True)
+    full_df = full_table.to_pandas().sort_values("revid").reset_index(drop=True)
+
+    # check_like / check_dtype=False relax column-order and dtype checks (assuming pandas' assert_frame_equal)
+    assert_frame_equal(resumed_df, full_df, check_like=True, check_dtype=False)
+
+    print(f"Resume with diff test passed! Original: {len(full_df)} rows, Resumed: {len(resumed_df)} rows")