104 lines
3.5 KiB
Python
104 lines
3.5 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
|
|
from github_datapipe.core.config import GithubConfig
|
|
from github_datapipe.phases.phase1_repository_sampling.service import (
|
|
SampleReposOptions,
|
|
resolve_query,
|
|
sample_repositories,
|
|
)
|
|
|
|
|
|
def test_resolve_query_uses_default_when_missing() -> None:
    """Check that ``resolve_query(None)`` yields ``GithubConfig.default_query``."""
    # No query supplied: the resolver is expected to hand back the configured default.
    resolved = resolve_query(None)
    assert resolved == GithubConfig.default_query
|
|
|
|
|
|
def test_resolve_query_uses_override_when_present() -> None:
    """Check that an explicit query string is returned unchanged by ``resolve_query``."""
    # An explicit query must win over whatever default the config carries.
    resolved = resolve_query("stars:>50")
    assert resolved == "stars:>50"
|
|
|
|
|
|
def test_sample_repositories_dedupes_and_persists(monkeypatch, tmp_path: Path) -> None:
    """
    Run ``sample_repositories`` against a stubbed API client and check its output.

    The stub serves three items on page 1 (repo ID 100 appears twice) and a
    single item on every other page. The test then asserts that the file named
    by ``result["repos_path"]`` contains one JSON object per line with repo IDs
    100, 101, 102 in that order, and that ``seen_repo_ids.json`` under
    ``tmp_path`` holds the same list of IDs.
    """
    # ---------------- Arrange ----------------
    monkeypatch.setenv("github_token", "token")

    class StubApiClient:
        """Stand-in for ``GithubApiClient`` that serves canned search pages."""

        def __init__(self, token: str, session: requests.Session | None = None) -> None:
            # Only the token is retained; the session argument is accepted and ignored.
            self.token = token

        def search_repositories(self, query: str, page: int, per_page: int) -> dict:
            """Return the canned payload for ``page``; ``query``/``per_page`` are unused."""
            if page == 1:
                # 'owner/repo-one' (ID 100) is listed twice on purpose so the
                # code under test has a duplicate to drop.
                items = [
                    fake_repo(100, "owner/repo-one"),
                    fake_repo(100, "owner/repo-one"),
                    fake_repo(101, "owner/repo-two"),
                ]
            else:
                items = [fake_repo(102, "owner/repo-three")]
            return {"total_count": 3, "items": items}

    # Swap the real client class for the stub inside the service module.
    client_target = "github_datapipe.phases.phase1_repository_sampling.service.GithubApiClient"
    monkeypatch.setattr(client_target, StubApiClient)

    # All output goes under pytest's per-test temporary directory.
    options = SampleReposOptions(
        count=3,
        output_root=tmp_path,
        query="stars:>10",
        mode="append-deduped",
        run_id="run-test",
    )

    # ---------------- Act ----------------
    result = sample_repositories(options)

    # ---------------- Assert ----------------
    # The repos file is read as JSON Lines: one record per line.
    repos_path = Path(result["repos_path"])
    repos_text = repos_path.read_text(encoding="utf-8")
    rows = [json.loads(line) for line in repos_text.splitlines()]
    assert len(rows) == 3
    persisted_ids = [row["repo_id"] for row in rows]
    assert persisted_ids == [100, 101, 102]

    # The seen-IDs file is expected directly under the output root.
    seen_path = tmp_path / "seen_repo_ids.json"
    seen_text = seen_path.read_text(encoding="utf-8")
    seen_payload = json.loads(seen_text)
    assert seen_payload == [100, 101, 102]
|
|
|
|
|
|
def fake_repo(repo_id: int, full_name: str) -> dict:
    """
    Build a canned GitHub repository payload for use in tests.

    ``repo_id`` becomes the ``id`` field and ``full_name`` (``owner/name``) is
    used verbatim and to derive the HTML URL, API URL and description. All
    remaining fields are fixed constants.
    """
    # Fields derived from the repository's full name.
    html_url = f"https://github.com/{full_name}"
    api_url = f"https://api.github.com/repos/{full_name}"
    description = f"Description for {full_name}"

    payload = {
        "id": repo_id,
        "full_name": full_name,
        "html_url": html_url,
        "url": api_url,
        "default_branch": "main",
        "language": "Python",
        "description": description,
        "stargazers_count": 42,
        "size": 2048,
        "fork": False,
        "archived": False,
        "visibility": "public",
    }
    return payload
|