122 lines
4.0 KiB
Python
122 lines
4.0 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
|
|
from github_datapipe.phases.phase2_commit_ingestion.service import (
|
|
FetchCommitsOptions,
|
|
fetch_commits,
|
|
)
|
|
|
|
|
|
def test_fetch_commits_writes_normalized_records_and_warning(monkeypatch, tmp_path: Path) -> None:
    """Check a refresh-mode fetch writes both fake commits and a warning status row.

    A single repository row is seeded under ``run-test``, the service's
    ``GithubApiClient`` is swapped for an in-memory fake that serves two
    commits on page 1, and ``fetch_commits`` runs with ``per_page=2`` and
    ``max_pages_per_repo=1``. The test then reads the JSONL files named by
    the result and checks the commit count, the first ``commit_key`` and the
    first status row.
    """
    monkeypatch.setenv("github_token", "token")

    # Seed the phase-1 repository list under the run directory.
    sampling_dir = tmp_path / "run-test" / "phase1_repository_sampling"
    seeded_repos = sampling_dir / "repos.jsonl"
    sampling_dir.mkdir(parents=True, exist_ok=True)
    seeded_repos.write_text(json.dumps(fake_repo_record()) + "\n", encoding="utf-8")

    class FakeClient:
        """In-memory replacement for the service's GithubApiClient."""

        def __init__(self, token: str, session: requests.Session | None = None) -> None:
            self.token = token

        def get_repository(self, full_name: str) -> dict:
            return {"full_name": full_name, "default_branch": "main"}

        def list_commits(self, full_name: str, branch: str, page: int, per_page: int) -> list[dict]:
            # Only the first page carries data; every later page is empty.
            if page != 1:
                return []
            return [fake_commit_payload("a" * 40), fake_commit_payload("b" * 40)]

    monkeypatch.setattr("github_datapipe.phases.phase2_commit_ingestion.service.GithubApiClient", FakeClient)

    options = FetchCommitsOptions(
        output_root=tmp_path,
        run_id="run-test",
        mode="refresh",
        per_page=2,
        max_pages_per_repo=1,
        retry_count=0,
    )
    result = fetch_commits(options)

    def load_jsonl(location: Path) -> list:
        """Parse every line of a JSONL file into a Python object."""
        return list(map(json.loads, location.read_text(encoding="utf-8").splitlines()))

    commit_rows = load_jsonl(Path(result["commits_path"]))
    expected_first_key = f"{fake_repo_record()['repo_id']}:{'a' * 40}"
    assert len(commit_rows) == 2
    assert commit_rows[0]["commit_key"] == expected_first_key

    # NOTE(review): the warning/truncated outcome presumably comes from the
    # full single page hitting the max_pages_per_repo cap; confirm in service.
    first_status = load_jsonl(Path(result["status_path"]))[0]
    assert first_status["status"] == "success_with_warning"
    assert first_status["truncated"] is True
|
|
|
|
|
|
def test_fetch_commits_resume_skips_completed_repos(monkeypatch, tmp_path: Path) -> None:
    """Check a resume-mode fetch processes nothing when the repo is already complete.

    One repository row is seeded together with a phase-2 ``repo_status.jsonl``
    entry marking that ``repo_id`` as ``complete``. The service's
    ``GithubApiClient`` is replaced by a fake whose constructor raises, so any
    attempt to build a client fails the test. The result must report zero
    processed repositories.
    """
    monkeypatch.setenv("github_token", "token")
    run_root = tmp_path / "run-test"

    # Phase-1 input: the single repository the run knows about.
    sampling_dir = run_root / "phase1_repository_sampling"
    sampling_dir.mkdir(parents=True, exist_ok=True)
    (sampling_dir / "repos.jsonl").write_text(json.dumps(fake_repo_record()) + "\n", encoding="utf-8")

    # Phase-2 state left by an earlier run: the same repository, marked complete.
    phase2_root = run_root / "phase2_commit_ingestion"
    phase2_root.mkdir(parents=True, exist_ok=True)
    completed_entry = {
        "repo_id": fake_repo_record()["repo_id"],
        "status": "complete",
    }
    status_file = phase2_root / "repo_status.jsonl"
    status_file.write_text(json.dumps(completed_entry) + "\n", encoding="utf-8")

    class FakeClient:
        """Client stand-in that fails the test as soon as it is instantiated."""

        def __init__(self, token: str, session: requests.Session | None = None) -> None:
            raise AssertionError("Client should not be initialized for already completed repos")

    monkeypatch.setattr("github_datapipe.phases.phase2_commit_ingestion.service.GithubApiClient", FakeClient)

    options = FetchCommitsOptions(
        output_root=tmp_path,
        run_id="run-test",
        mode="resume",
        retry_count=0,
    )
    result = fetch_commits(options)

    assert result["processed_repositories"] == 0
|
|
|
|
|
|
def fake_repo_record() -> dict:
    """Return a fresh repository row for the ``run-test`` run.

    Keys are inserted in the order ``run_id``, ``repo_id``, ``full_name``,
    ``html_url`` so the JSON serialization stays stable.
    """
    record: dict = {}
    record["run_id"] = "run-test"
    record["repo_id"] = 100
    record["full_name"] = "owner/repo-one"
    record["html_url"] = "https://github.com/owner/repo-one"
    return record
|
|
|
|
|
|
def fake_commit_payload(sha: str) -> dict:
    """Return a fake commit payload dict for the given ``sha``.

    Args:
        sha: Value stored under ``"sha"`` and interpolated into ``"html_url"``.

    Returns:
        A new dict with ``sha``, ``html_url``, a single-entry ``parents`` list
        and a ``commit`` section holding fixed author, committer and message
        values.
    """
    author = {
        "name": "Alice",
        "email": "alice@example.com",
        "date": "2024-01-01T00:00:00Z",
    }
    committer = {
        "name": "Bob",
        "email": "bob@example.com",
        "date": "2024-01-01T00:00:00Z",
    }
    commit_section = {
        "author": author,
        "committer": committer,
        "message": "Initial commit",
    }
    return {
        "sha": sha,
        "html_url": f"https://github.com/owner/repo-one/commit/{sha}",
        "parents": [{"sha": "parent-sha"}],
        "commit": commit_section,
    }
|