1
0
Files
githubDataSampler/tests/test_phase1.py

104 lines
3.5 KiB
Python

from __future__ import annotations
import json
from pathlib import Path
import requests
from github_datapipe.core.config import GithubConfig
from github_datapipe.phases.phase1_repository_sampling.service import (
SampleReposOptions,
resolve_query,
sample_repositories,
)
def test_resolve_query_uses_default_when_missing() -> None:
    """Check that `resolve_query(None)` yields `GithubConfig.default_query`."""
    # No query supplied: the resolver is expected to hand back the configured default.
    resolved = resolve_query(None)
    expected = GithubConfig.default_query
    assert resolved == expected
def test_resolve_query_uses_override_when_present() -> None:
    """Check that an explicitly supplied query string is returned unchanged."""
    resolved = resolve_query("stars:>50")
    # The caller's value must win over whatever default the config carries.
    assert resolved == "stars:>50"
def test_sample_repositories_dedupes_and_persists(monkeypatch, tmp_path: Path) -> None:
    """
    Run `sample_repositories` against a stubbed API client and inspect its output files.

    The stub serves two result pages in which repository ID 100 appears twice.
    The test then checks that the JSON-lines file named by `result["repos_path"]`
    and `seen_repo_ids.json` under `tmp_path` each list IDs 100, 101 and 102
    exactly once, in that order.
    """
    # ------------------------------------------
    # Arrange
    # ------------------------------------------
    # NOTE(review): the variable name is lowercase; confirm it matches the name
    # the service actually reads for its token.
    monkeypatch.setenv("github_token", "token")

    # (repo_id, full_name) pairs per page. ID 100 is listed twice on page 1 so
    # that duplicate handling is exercised.
    page_one_specs = [
        (100, "owner/repo-one"),
        (100, "owner/repo-one"),
        (101, "owner/repo-two"),
    ]
    later_page_specs = [(102, "owner/repo-three")]

    class FakeSampler:
        """Stub patched in place of `GithubApiClient`; returns canned search pages."""

        def __init__(self, token: str, session: requests.Session | None = None) -> None:
            # `session` is accepted but not used by the stub.
            self.token = token

        def search_repositories(self, query: str, page: int, per_page: int) -> dict:
            # Page 1 carries three items; any other page carries the final repo.
            specs = page_one_specs if page == 1 else later_page_specs
            items = [fake_repo(repo_id, name) for repo_id, name in specs]
            return {"total_count": 3, "items": items}

    # Swap the client referenced by the service module for the stub.
    monkeypatch.setattr(
        "github_datapipe.phases.phase1_repository_sampling.service.GithubApiClient",
        FakeSampler,
    )

    options = SampleReposOptions(
        count=3,
        output_root=tmp_path,  # keep all written files inside pytest's temp dir
        query="stars:>10",
        mode="append-deduped",
        run_id="run-test",
    )

    # ------------------------------------------
    # Act
    # ------------------------------------------
    result = sample_repositories(options)

    # ------------------------------------------
    # Assert
    # ------------------------------------------
    expected_ids = [100, 101, 102]

    # The repos file is read as one JSON document per line.
    repos_file = Path(result["repos_path"])
    rows = []
    for raw_line in repos_file.read_text(encoding="utf-8").splitlines():
        rows.append(json.loads(raw_line))
    assert len(rows) == 3
    persisted_ids = [row["repo_id"] for row in rows]
    assert persisted_ids == expected_ids

    # The seen-IDs file is a single JSON array directly under the output root.
    seen_file = tmp_path / "seen_repo_ids.json"
    seen_ids = json.loads(seen_file.read_text(encoding="utf-8"))
    assert seen_ids == expected_ids
def fake_repo(repo_id: int, full_name: str) -> dict:
    """
    Build a mock GitHub repository payload for the given ID and full name.

    `repo_id` becomes the `id` field and `full_name` is embedded in the name,
    both URLs and the description; all remaining fields are fixed constants.
    NOTE(review): the original author states these are the fields
    `normalize_repo_record` requires; that function is not visible here — confirm.
    """
    # Fields derived from the arguments.
    payload = {
        "id": repo_id,
        "full_name": full_name,
        "html_url": f"https://github.com/{full_name}",
        "url": f"https://api.github.com/repos/{full_name}",
    }
    # Constant fields shared by every fake repository.
    payload.update(
        default_branch="main",
        language="Python",
        description=f"Description for {full_name}",
        stargazers_count=42,
        size=2048,
        fork=False,
        archived=False,
        visibility="public",
    )
    return payload