Add phase 2 - download default branch commits for repositories collected in phase 1
This commit is contained in:
121
tests/test_phase2.py
Normal file
121
tests/test_phase2.py
Normal file
@@ -0,0 +1,121 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
from github_datapipe.phases.phase2_commit_ingestion.service import (
|
||||
FetchCommitsOptions,
|
||||
fetch_commits,
|
||||
)
|
||||
|
||||
|
||||
def test_fetch_commits_writes_normalized_records_and_warning(monkeypatch, tmp_path: Path) -> None:
    """Run ``fetch_commits`` in refresh mode against a stubbed API client.

    A single phase 1 repository record is written under ``tmp_path`` and the
    GitHub client is replaced with a stub that returns two commits on page 1.
    The test then checks the commit rows and the per-repository status row
    that ``fetch_commits`` reports in its result.
    """

    def load_jsonl(path: Path) -> list[dict]:
        # One JSON document per line, decoded in file order.
        return [json.loads(raw) for raw in path.read_text(encoding="utf-8").splitlines()]

    monkeypatch.setenv("github_token", "token")

    # Seed the phase 1 output that phase 2 is expected to read.
    phase1_dir = tmp_path / "run-test" / "phase1_repository_sampling"
    phase1_dir.mkdir(parents=True, exist_ok=True)
    (phase1_dir / "repos.jsonl").write_text(
        json.dumps(fake_repo_record()) + "\n",
        encoding="utf-8",
    )

    first_sha = "a" * 40
    second_sha = "b" * 40

    class FakeClient:
        """Stand-in for the API client that serves canned responses."""

        def __init__(self, token: str, session: requests.Session | None = None) -> None:
            self.token = token

        def get_repository(self, full_name: str) -> dict:
            return {"full_name": full_name, "default_branch": "main"}

        def list_commits(self, full_name: str, branch: str, page: int, per_page: int) -> list[dict]:
            # Only the first page carries commits; every later page is empty.
            if page != 1:
                return []
            return [fake_commit_payload(first_sha), fake_commit_payload(second_sha)]

    client_target = "github_datapipe.phases.phase2_commit_ingestion.service.GithubApiClient"
    monkeypatch.setattr(client_target, FakeClient)

    # NOTE(review): per_page=2 with max_pages_per_repo=1 presumably is what
    # triggers the truncation warning asserted below - confirm in the service.
    options = FetchCommitsOptions(
        output_root=tmp_path,
        run_id="run-test",
        mode="refresh",
        per_page=2,
        max_pages_per_repo=1,
        retry_count=0,
    )
    result = fetch_commits(options)

    commit_rows = load_jsonl(Path(result["commits_path"]))
    assert len(commit_rows) == 2
    expected_key = f"{fake_repo_record()['repo_id']}:{first_sha}"
    assert commit_rows[0]["commit_key"] == expected_key

    status_rows = load_jsonl(Path(result["status_path"]))
    first_status = status_rows[0]
    assert first_status["status"] == "success_with_warning"
    assert first_status["truncated"] is True
|
||||
|
||||
|
||||
def test_fetch_commits_resume_skips_completed_repos(monkeypatch, tmp_path: Path) -> None:
    """Run ``fetch_commits`` in resume mode with a pre-existing status file.

    The only phase 1 repository already has a ``"complete"`` row in
    ``repo_status.jsonl``. The stub client raises as soon as it is
    constructed, and the result must report zero processed repositories.
    """
    monkeypatch.setenv("github_token", "token")

    run_root = tmp_path / "run-test"
    repo = fake_repo_record()

    # Phase 1 output: the single repository known to this run.
    phase1_dir = run_root / "phase1_repository_sampling"
    phase1_dir.mkdir(parents=True, exist_ok=True)
    (phase1_dir / "repos.jsonl").write_text(json.dumps(repo) + "\n", encoding="utf-8")

    # Phase 2 status left behind by an earlier run for that same repository.
    phase2_dir = run_root / "phase2_commit_ingestion"
    phase2_dir.mkdir(parents=True, exist_ok=True)
    finished_row = {
        "repo_id": repo["repo_id"],
        "status": "complete",
    }
    status_file = phase2_dir / "repo_status.jsonl"
    status_file.write_text(json.dumps(finished_row) + "\n", encoding="utf-8")

    class FakeClient:
        """Stub client whose construction fails the test."""

        def __init__(self, token: str, session: requests.Session | None = None) -> None:
            raise AssertionError("Client should not be initialized for already completed repos")

    client_target = "github_datapipe.phases.phase2_commit_ingestion.service.GithubApiClient"
    monkeypatch.setattr(client_target, FakeClient)

    options = FetchCommitsOptions(
        output_root=tmp_path,
        run_id="run-test",
        mode="resume",
        retry_count=0,
    )
    result = fetch_commits(options)

    assert result["processed_repositories"] == 0
|
||||
|
||||
|
||||
def fake_repo_record() -> dict:
    """Return a fresh repository record dict used as the phase 1 fixture row.

    A new dict is built on every call, so callers may mutate the result.
    """
    record: dict = {"run_id": "run-test", "repo_id": 100}
    # Insertion order is kept so the serialized JSON matches key for key.
    record["full_name"] = "owner/repo-one"
    record["html_url"] = "https://github.com/owner/repo-one"
    return record
|
||||
|
||||
|
||||
def fake_commit_payload(sha: str) -> dict:
    """Build a canned commit payload dict for the given commit ``sha``.

    Only ``sha`` and the ``html_url`` derived from it vary between calls;
    the parent, author, committer and message values are fixed.
    """
    author = {
        "name": "Alice",
        "email": "alice@example.com",
        "date": "2024-01-01T00:00:00Z",
    }
    committer = {
        "name": "Bob",
        "email": "bob@example.com",
        "date": "2024-01-01T00:00:00Z",
    }
    commit_details = {
        "author": author,
        "committer": committer,
        "message": "Initial commit",
    }
    payload = {
        "sha": sha,
        "html_url": f"https://github.com/owner/repo-one/commit/{sha}",
        "parents": [{"sha": "parent-sha"}],
        "commit": commit_details,
    }
    return payload
|
||||
Reference in New Issue
Block a user