from __future__ import annotations

import json
from pathlib import Path

import requests

from github_datapipe.config import GithubConfig
from github_datapipe.extract_repos import (
    SampleReposOptions,
    resolve_query,
    sample_repositories,
)


def test_resolve_query_uses_default_when_missing() -> None:
    """Check that ``resolve_query(None)`` yields ``GithubConfig.default_query``."""
    resolved = resolve_query(None)
    assert resolved == GithubConfig.default_query


def test_resolve_query_uses_override_when_present() -> None:
    """Check that an explicitly supplied query string is returned unchanged."""
    override = resolve_query("stars:>50")
    assert override == "stars:>50"


def test_sample_repositories_dedupes_and_persists(monkeypatch, tmp_path: Path) -> None:
    """Exercise ``sample_repositories`` end to end against a stubbed sampler.

    The stub serves four search hits over two pages, one of which is a
    repeat of repo id 100 on the first page. The test expects exactly three
    unique rows in the file named by ``result["repos_path"]`` and the same
    three ids in ``seen_repo_ids.json`` under the output root.
    """
    # --- Arrange ---------------------------------------------------------
    # NOTE(review): presumably the pipeline reads this variable to obtain
    # its API token; confirm the expected name/casing against the config.
    monkeypatch.setenv("github_token", "token")

    class FakeSampler:
        """Stand-in for ``GithubRepoSampler`` that never touches the network."""

        def __init__(self, token: str, session: requests.Session | None = None) -> None:
            # The session argument is accepted for signature parity only.
            self.token = token

        def search_repositories(self, query: str, page: int, per_page: int) -> dict:
            """Return a canned search response; only ``page`` affects it."""
            if page != 1:
                # Every page other than the first yields the single new repo.
                specs = [(102, "owner/repo-three")]
            else:
                # First page: id 100 appears twice to trigger deduplication.
                specs = [
                    (100, "owner/repo-one"),
                    (100, "owner/repo-one"),
                    (101, "owner/repo-two"),
                ]
            return {
                "total_count": 3,
                "items": [fake_repo(repo_id, name) for repo_id, name in specs],
            }

    # Swap the real sampler for the stub inside the module under test.
    monkeypatch.setattr("github_datapipe.extract_repos.GithubRepoSampler", FakeSampler)

    options = SampleReposOptions(
        count=3,
        output_root=tmp_path,  # keep all written files inside pytest's temp dir
        query="stars:>10",
        mode="append-deduped",
        run_id="run-test",
    )

    # --- Act -------------------------------------------------------------
    result = sample_repositories(options)

    # --- Assert ----------------------------------------------------------
    # The repos file is parsed as one JSON document per line.
    repo_lines = Path(result["repos_path"]).read_text(encoding="utf-8").splitlines()
    records = list(map(json.loads, repo_lines))
    assert len(records) == 3
    assert [record["repo_id"] for record in records] == [100, 101, 102]

    seen_text = (tmp_path / "seen_repo_ids.json").read_text(encoding="utf-8")
    seen_ids = json.loads(seen_text)
    assert seen_ids == [100, 101, 102]


def fake_repo(repo_id: int, full_name: str) -> dict:
    """Build a minimal GitHub-style repository payload for the stub sampler.

    ``repo_id`` and ``full_name`` drive the identity, URL and description
    fields; everything else is a fixed constant. Presumably this covers the
    fields the pipeline's record normalisation reads (verify against
    ``normalize_repo_record``).
    """
    identity = {
        "id": repo_id,
        "full_name": full_name,
        "html_url": f"https://github.com/{full_name}",
        "url": f"https://api.github.com/repos/{full_name}",
    }
    metadata = {
        "default_branch": "main",
        "language": "Python",
        "description": f"Description for {full_name}",
        "stargazers_count": 42,
        "size": 2048,
        "fork": False,
        "archived": False,
        "visibility": "public",
    }
    # Merge preserves the original key order: identity fields first.
    return {**identity, **metadata}