# File: githubDataSampler/tests/test_phase2.py (Python, 122 lines, 4.0 KiB)

from __future__ import annotations
import json
from pathlib import Path
import requests
from github_datapipe.phases.phase2_commit_ingestion.service import (
FetchCommitsOptions,
fetch_commits,
)
def test_fetch_commits_writes_normalized_records_and_warning(
    monkeypatch, tmp_path: Path
) -> None:
    """Check that a refresh run writes commit rows and a warning status.

    The test seeds a phase-1 ``repos.jsonl`` containing a single repository,
    replaces ``GithubApiClient`` in the phase-2 service module with a stub
    that serves two commits on page 1, and runs ``fetch_commits`` with
    ``per_page=2`` and ``max_pages_per_repo=1``.  It then verifies the
    commit rows and the per-repository status row written to disk.
    """
    monkeypatch.setenv("github_token", "token")

    # Phase 2 reads its repository list from the phase-1 output of the same run.
    repos_path = tmp_path / "run-test" / "phase1_repository_sampling" / "repos.jsonl"
    repos_path.parent.mkdir(parents=True, exist_ok=True)
    repos_path.write_text(json.dumps(fake_repo_record()) + "\n", encoding="utf-8")

    class FakeClient:
        """Stand-in for ``GithubApiClient`` that never touches the network."""

        def __init__(self, token: str, session: requests.Session | None = None) -> None:
            self.token = token

        def get_repository(self, full_name: str) -> dict:
            return {"full_name": full_name, "default_branch": "main"}

        def list_commits(self, full_name: str, branch: str, page: int, per_page: int) -> list[dict]:
            # Only the first page carries data; any later page is empty.
            if page == 1:
                return [fake_commit_payload("a" * 40), fake_commit_payload("b" * 40)]
            return []

    monkeypatch.setattr(
        "github_datapipe.phases.phase2_commit_ingestion.service.GithubApiClient",
        FakeClient,
    )

    result = fetch_commits(
        FetchCommitsOptions(
            output_root=tmp_path,
            run_id="run-test",
            mode="refresh",
            per_page=2,
            max_pages_per_repo=1,
            retry_count=0,
        )
    )

    commits_path = Path(result["commits_path"])
    commit_rows = [
        json.loads(line)
        for line in commits_path.read_text(encoding="utf-8").splitlines()
    ]
    assert len(commit_rows) == 2
    assert commit_rows[0]["commit_key"] == f"{fake_repo_record()['repo_id']}:{'a' * 40}"

    status_path = Path(result["status_path"])
    status_rows = [
        json.loads(line)
        for line in status_path.read_text(encoding="utf-8").splitlines()
    ]
    # NOTE(review): presumably the warning/truncation is raised because the
    # single allowed page came back full -- confirm against the service.
    assert status_rows[0]["status"] == "success_with_warning"
    assert status_rows[0]["truncated"] is True
def test_fetch_commits_resume_skips_completed_repos(monkeypatch, tmp_path: Path) -> None:
    """Check that resume mode does not reprocess a repository marked complete.

    The test seeds both the phase-1 ``repos.jsonl`` and a phase-2
    ``repo_status.jsonl`` whose only row marks the repository as
    ``"complete"``.  ``GithubApiClient`` is replaced with a stub whose
    constructor fails the test, so any attempt to build a client is caught.
    """
    monkeypatch.setenv("github_token", "token")

    run_root = tmp_path / "run-test"
    repos_path = run_root / "phase1_repository_sampling" / "repos.jsonl"
    repos_path.parent.mkdir(parents=True, exist_ok=True)
    repos_path.write_text(json.dumps(fake_repo_record()) + "\n", encoding="utf-8")

    # Pre-existing phase-2 status file recording the repository as finished.
    phase2_root = run_root / "phase2_commit_ingestion"
    phase2_root.mkdir(parents=True, exist_ok=True)
    (phase2_root / "repo_status.jsonl").write_text(
        json.dumps(
            {
                "repo_id": fake_repo_record()["repo_id"],
                "status": "complete",
            }
        )
        + "\n",
        encoding="utf-8",
    )

    class FakeClient:
        """Stub client whose construction is itself a test failure."""

        def __init__(self, token: str, session: requests.Session | None = None) -> None:
            raise AssertionError("Client should not be initialized for already completed repos")

    monkeypatch.setattr(
        "github_datapipe.phases.phase2_commit_ingestion.service.GithubApiClient",
        FakeClient,
    )

    result = fetch_commits(
        FetchCommitsOptions(
            output_root=tmp_path,
            run_id="run-test",
            mode="resume",
            retry_count=0,
        )
    )

    assert result["processed_repositories"] == 0
def fake_repo_record() -> dict:
    """Return a fresh repository record used to seed ``repos.jsonl`` in tests.

    A new dict is built on every call, so callers may mutate the result
    without affecting other tests.
    """
    return {
        "run_id": "run-test",
        "repo_id": 100,
        "full_name": "owner/repo-one",
        "html_url": "https://github.com/owner/repo-one",
    }
def fake_commit_payload(sha: str) -> dict:
    """Return a fake commit payload for ``sha``, as served by the stub client.

    Args:
        sha: Commit hash placed in the ``sha`` field and at the end of
            ``html_url``.

    Returns:
        A new dict with ``sha``, ``html_url``, a single-entry ``parents``
        list, and a nested ``commit`` object holding author, committer and
        message.
    """
    return {
        "sha": sha,
        "html_url": f"https://github.com/owner/repo-one/commit/{sha}",
        "parents": [{"sha": "parent-sha"}],
        "commit": {
            "author": {
                "name": "Alice",
                "email": "alice@example.com",
                "date": "2024-01-01T00:00:00Z",
            },
            "committer": {
                "name": "Bob",
                "email": "bob@example.com",
                "date": "2024-01-01T00:00:00Z",
            },
            "message": "Initial commit",
        },
    }