1
0

Refactored to scale the CLI with the Click library

This commit is contained in:
HBrahmbhatt
2026-04-24 13:08:24 -07:00
parent 98696ddb29
commit f2677cb1ad
10 changed files with 377 additions and 145 deletions

View File

@@ -1,3 +1,10 @@
"""
Phase 1: Repository Sampling Service.
This service is responsible for searching GitHub for repositories based on a query,
filtering out previously seen repositories (if requested), and saving the results.
"""
from __future__ import annotations
from dataclasses import dataclass
@@ -12,6 +19,7 @@ from github_datapipe.core.runtime import build_run_id, require_github_token, utc
@dataclass(frozen=True)
class SampleReposOptions:
"""Options for the repository sampling phase."""
count: int = GithubConfig.default_repo_count
output_root: Path = Path(GithubConfig.default_output_root)
query: str = GithubConfig.default_query
@@ -21,10 +29,23 @@ class SampleReposOptions:
def resolve_query(query_override: str | None) -> str:
    """Resolve the search query, preferring a non-blank override.

    Args:
        query_override: Optional query text supplied by the caller.
            Surrounding whitespace is stripped before use.

    Returns:
        The stripped override when it contains non-whitespace text,
        otherwise ``GithubConfig.default_query``.
    """
    # Strip before testing truthiness: a whitespace-only override is truthy
    # but strips to "", which would otherwise be returned as an empty query.
    stripped = query_override.strip() if query_override else ""
    return stripped or GithubConfig.default_query
def sample_repositories(options: SampleReposOptions) -> dict[str, Any]:
"""
Execute Phase 1: Sample repositories from GitHub.
Searches for repositories, handles pagination and deduplication, and saves
the collected repository data and a run manifest.
Args:
options: Configuration options for sampling.
Returns:
A dictionary containing the run ID and paths to generated files.
"""
token = require_github_token()
if options.count <= 0:
raise ValueError("`count` must be greater than 0.")
@@ -121,6 +142,7 @@ def normalize_repo_record(
sample_page: int,
sampled_at: str,
) -> dict[str, Any]:
"""Normalize a raw GitHub repository payload into our internal schema."""
return {
"run_id": run_id,
"repo_id": repo["id"],
@@ -142,6 +164,7 @@ def normalize_repo_record(
def load_seen_repo_ids(path: Path) -> set[int]:
"""Load the set of repository IDs that have already been sampled."""
if not path.exists():
return set()
payload = read_json(path)