Refactored to scale cli with click library
This commit is contained in:
@@ -1,3 +1,10 @@
|
||||
"""
|
||||
Phase 1: Repository Sampling Service.
|
||||
|
||||
This service is responsible for searching GitHub for repositories based on a query,
|
||||
filtering out previously seen repositories (if requested), and saving the results.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
@@ -12,6 +19,7 @@ from github_datapipe.core.runtime import build_run_id, require_github_token, utc
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class SampleReposOptions:
|
||||
"""Options for the repository sampling phase."""
|
||||
count: int = GithubConfig.default_repo_count
|
||||
output_root: Path = Path(GithubConfig.default_output_root)
|
||||
query: str = GithubConfig.default_query
|
||||
@@ -21,10 +29,23 @@ class SampleReposOptions:
|
||||
|
||||
|
||||
def resolve_query(query_override: str | None) -> str:
    """Resolve the search query, using the override if provided, else the default.

    Args:
        query_override: Optional user-supplied query. ``None``, an empty
            string, or a whitespace-only string all count as "not provided".

    Returns:
        The override with surrounding whitespace removed, or
        ``GithubConfig.default_query`` when no usable override was given.
    """
    # Strip *before* testing truthiness: a whitespace-only override is truthy,
    # so checking the raw value would resolve to an empty query string.
    cleaned = query_override.strip() if query_override else ""
    return cleaned or GithubConfig.default_query
|
||||
|
||||
|
||||
def sample_repositories(options: SampleReposOptions) -> dict[str, Any]:
|
||||
"""
|
||||
Execute Phase 1: Sample repositories from GitHub.
|
||||
|
||||
Searches for repositories, handles pagination and deduplication, and saves
|
||||
the collected repository data and a run manifest.
|
||||
|
||||
Args:
|
||||
options: Configuration options for sampling.
|
||||
|
||||
Returns:
|
||||
A dictionary containing the run ID and paths to generated files.
|
||||
"""
|
||||
token = require_github_token()
|
||||
if options.count <= 0:
|
||||
raise ValueError("`count` must be greater than 0.")
|
||||
@@ -121,6 +142,7 @@ def normalize_repo_record(
|
||||
sample_page: int,
|
||||
sampled_at: str,
|
||||
) -> dict[str, Any]:
|
||||
"""Normalize a raw GitHub repository payload into our internal schema."""
|
||||
return {
|
||||
"run_id": run_id,
|
||||
"repo_id": repo["id"],
|
||||
@@ -142,6 +164,7 @@ def normalize_repo_record(
|
||||
|
||||
|
||||
def load_seen_repo_ids(path: Path) -> set[int]:
|
||||
"""Load the set of repository IDs that have already been sampled."""
|
||||
if not path.exists():
|
||||
return set()
|
||||
payload = read_json(path)
|
||||
|
||||
Reference in New Issue
Block a user