Add phase 1 GitHub repo sampling pipeline
This commit is contained in:
99
tests/test_phase1.py
Normal file
99
tests/test_phase1.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
from github_datapipe.config import GithubConfig
|
||||
from github_datapipe.extract_repos import SampleReposOptions, resolve_query, sample_repositories
|
||||
|
||||
|
||||
def test_resolve_query_uses_default_when_missing() -> None:
    """Check that passing ``None`` to ``resolve_query`` yields ``GithubConfig.default_query``."""
    resolved = resolve_query(None)
    fallback = GithubConfig.default_query
    assert resolved == fallback
|
||||
|
||||
|
||||
def test_resolve_query_uses_override_when_present() -> None:
    """Check that an explicit query string is returned unchanged by ``resolve_query``."""
    resolved = resolve_query("stars:>50")
    assert resolved == "stars:>50"
|
||||
|
||||
|
||||
def test_sample_repositories_dedupes_and_persists(monkeypatch, tmp_path: Path) -> None:
    """
    Run ``sample_repositories`` end to end against a stubbed sampler.

    The stub serves four items over two pages, one of which repeats repo
    ID 100. The test expects three unique rows in the line-delimited JSON
    file named by ``result["repos_path"]`` and the same three IDs, in
    order, in ``seen_repo_ids.json`` under ``tmp_path``.
    """

    # ---------------- Arrange ----------------
    monkeypatch.setenv("github_token", "token")

    class FakeSampler:
        """Stand-in sampler that answers searches with canned pages."""

        def __init__(self, token: str, session: requests.Session | None = None) -> None:
            # The session argument is accepted for signature parity and ignored.
            self.token = token

        def search_repositories(self, query: str, page: int, per_page: int) -> dict:
            # Every page other than the first yields the single trailing repo.
            if page != 1:
                items = [fake_repo(102, "owner/repo-three")]
            else:
                # ID 100 appears twice on purpose to exercise deduplication.
                items = [
                    fake_repo(100, "owner/repo-one"),
                    fake_repo(100, "owner/repo-one"),
                    fake_repo(101, "owner/repo-two"),
                ]
            return {"total_count": 3, "items": items}

    # Swap the sampler class that the module under test looks up by name.
    monkeypatch.setattr("github_datapipe.extract_repos.GithubRepoSampler", FakeSampler)

    # All output goes under pytest's temporary directory.
    sampling_options = SampleReposOptions(
        count=3,
        output_root=tmp_path,
        query="stars:>10",
        mode="append-deduped",
        run_id="run-test",
    )

    # ---------------- Act ----------------
    outcome = sample_repositories(sampling_options)

    # ---------------- Assert ----------------
    written_file = Path(outcome["repos_path"])
    records = []
    for raw_line in written_file.read_text(encoding="utf-8").splitlines():
        records.append(json.loads(raw_line))
    assert len(records) == 3
    assert [record["repo_id"] for record in records] == [100, 101, 102]

    seen_ids_file = tmp_path / "seen_repo_ids.json"
    seen_ids = json.loads(seen_ids_file.read_text(encoding="utf-8"))
    assert seen_ids == [100, 101, 102]
|
||||
|
||||
|
||||
def fake_repo(repo_id: int, full_name: str) -> dict:
    """
    Build a mock GitHub repository payload for the given ID and full name.

    The URLs and description are derived from ``full_name``; every other
    field is a fixed placeholder value. Presumably these are the fields the
    pipeline's record normalization reads -- confirm against that code.
    """
    web_url = f"https://github.com/{full_name}"
    api_url = f"https://api.github.com/repos/{full_name}"
    summary = f"Description for {full_name}"

    # Identity fields first, so key order matches the original literal.
    payload: dict = {
        "id": repo_id,
        "full_name": full_name,
        "html_url": web_url,
        "url": api_url,
    }
    payload.update(
        {
            "default_branch": "main",
            "language": "Python",
            "description": summary,
        }
    )
    payload.update(
        {
            "stargazers_count": 42,
            "size": 2048,
            "fork": False,
            "archived": False,
            "visibility": "public",
        }
    )
    return payload
|
||||
Reference in New Issue
Block a user