from __future__ import annotations import json from pathlib import Path import requests from github_datapipe.phases.phase2_commit_ingestion.service import ( FetchCommitsOptions, fetch_commits, ) def test_fetch_commits_writes_normalized_records_and_warning(monkeypatch, tmp_path: Path) -> None: monkeypatch.setenv("github_token", "token") repos_path = tmp_path / "run-test" / "phase1_repository_sampling" / "repos.jsonl" repos_path.parent.mkdir(parents=True, exist_ok=True) repos_path.write_text(json.dumps(fake_repo_record()) + "\n", encoding="utf-8") class FakeClient: def __init__(self, token: str, session: requests.Session | None = None) -> None: self.token = token def get_repository(self, full_name: str) -> dict: return {"full_name": full_name, "default_branch": "main"} def list_commits(self, full_name: str, branch: str, page: int, per_page: int) -> list[dict]: if page == 1: return [fake_commit_payload("a" * 40), fake_commit_payload("b" * 40)] return [] monkeypatch.setattr("github_datapipe.phases.phase2_commit_ingestion.service.GithubApiClient", FakeClient) result = fetch_commits( FetchCommitsOptions( output_root=tmp_path, run_id="run-test", mode="refresh", per_page=2, max_pages_per_repo=1, retry_count=0, ) ) commits_path = Path(result["commits_path"]) commit_rows = [json.loads(line) for line in commits_path.read_text(encoding="utf-8").splitlines()] assert len(commit_rows) == 2 assert commit_rows[0]["commit_key"] == f"{fake_repo_record()['repo_id']}:{'a' * 40}" status_path = Path(result["status_path"]) status_rows = [json.loads(line) for line in status_path.read_text(encoding="utf-8").splitlines()] assert status_rows[0]["status"] == "success_with_warning" assert status_rows[0]["truncated"] is True def test_fetch_commits_resume_skips_completed_repos(monkeypatch, tmp_path: Path) -> None: monkeypatch.setenv("github_token", "token") run_root = tmp_path / "run-test" repos_path = run_root / "phase1_repository_sampling" / "repos.jsonl" repos_path.parent.mkdir(parents=True, exist_ok=True) repos_path.write_text(json.dumps(fake_repo_record()) + "\n", encoding="utf-8") phase2_root = run_root / "phase2_commit_ingestion" phase2_root.mkdir(parents=True, exist_ok=True) (phase2_root / "repo_status.jsonl").write_text( json.dumps( { "repo_id": fake_repo_record()["repo_id"], "status": "complete", } ) + "\n", encoding="utf-8", ) class FakeClient: def __init__(self, token: str, session: requests.Session | None = None) -> None: raise AssertionError("Client should not be initialized for already completed repos") monkeypatch.setattr("github_datapipe.phases.phase2_commit_ingestion.service.GithubApiClient", FakeClient) result = fetch_commits( FetchCommitsOptions( output_root=tmp_path, run_id="run-test", mode="resume", retry_count=0, ) ) assert result["processed_repositories"] == 0 def fake_repo_record() -> dict: return { "run_id": "run-test", "repo_id": 100, "full_name": "owner/repo-one", "html_url": "https://github.com/owner/repo-one", } def fake_commit_payload(sha: str) -> dict: return { "sha": sha, "html_url": f"https://github.com/owner/repo-one/commit/{sha}", "parents": [{"sha": "parent-sha"}], "commit": { "author": { "name": "Alice", "email": "alice@example.com", "date": "2024-01-01T00:00:00Z", }, "committer": { "name": "Bob", "email": "bob@example.com", "date": "2024-01-01T00:00:00Z", }, "message": "Initial commit", }, }