Refactored to scale cli with click library

2026-04-24 13:08:24 -07:00
parent 98696ddb29
commit f2677cb1ad
10 changed files with 377 additions and 145 deletions
--- a/README.md
+++ b/README.md
@@ -36,6 +36,21 @@ From the project root, install the local environment with:
 poetry install
 ```

+## CLI Help
+
+To see the available commands and their descriptions, run:
+
+```powershell
+poetry run github-datapipe --help
+```
+
+To see the available arguments and options for a specific command, use the `--help` flag with that command:
+
+```powershell
+poetry run github-datapipe sample-repos --help
+poetry run github-datapipe fetch-commits --help
+```
+
 ## Run Phase 1

 The default phase 1 command collects 10 repositories using the default search query from `src/github_datapipe/core/config.py`.
--- a/poetry.lock
+++ b/poetry.lock
@@ -151,6 +151,21 @@ files = [
    {file = "charset_normalizer-3.4.7.tar.gz", hash = "sha256:ae89db9e5f98a11a4bf50407d4363e7b09b31e55bc117b4f7d80aab97ba009e5"},
 ]

+[[package]]
+name = "click"
+version = "8.3.3"
+description = "Composable command line interface toolkit"
+optional = false
+python-versions = ">=3.10"
+groups = ["main"]
+files = [
+    {file = "click-8.3.3-py3-none-any.whl", hash = "sha256:a2bf429bb3033c89fa4936ffb35d5cb471e3719e1f3c8a7c3fff0b8314305613"},
+    {file = "click-8.3.3.tar.gz", hash = "sha256:398329ad4837b2ff7cbe1dd166a4c0f8900c3ca3a218de04466f38f6497f18a2"},
+]
+
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+
 [[package]]
 name = "colorama"
 version = "0.4.6"
@@ -158,7 +173,7 @@ description = "Cross-platform colored terminal text."
 optional = false
 python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
 groups = ["main"]
-markers = "sys_platform == \"win32\""
+markers = "sys_platform == \"win32\" or platform_system == \"Windows\""
 files = [
    {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
@@ -606,4 +621,4 @@ zstd = ["backports-zstd (>=1.0.0) ; python_version < \"3.14\""]
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.12"
-content-hash = "3c8765a0ae32ec30eec764e892bef0c0e87524d351f88387e91b78c4b604d112"
+content-hash = "43a523cd40c7fc6a64072c377740862a41c1fd19a2be575974aa319838191e02"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,8 @@ dependencies = [
    "pyarrow (>=23.0.1,<24.0.0)",
    "python-dotenv (>=1.2.2,<2.0.0)",
    "pytest (>=9.0.3,<10.0.0)",
-    "requests-mock (>=1.12.1,<2.0.0)"
+    "requests-mock (>=1.12.1,<2.0.0)",
+    "click (>=8.3.3,<9.0.0)"
 ]

 [project.scripts]
--- a/src/github_datapipe/cli.py
+++ b/src/github_datapipe/cli.py
@@ -1,6 +1,13 @@
+"""
+Command-line interface for the GitHub data pipeline.
+
+This module provides the entry point for the CLI, allowing users to run different
+phases of the pipeline, such as repository sampling and commit ingestion.
+"""
+
 from __future__ import annotations

-import argparse
+import click
 from pathlib import Path

 from github_datapipe.core.config import GithubConfig
@@ -17,149 +24,172 @@ from github_datapipe.phases.phase2_commit_ingestion.service import (
 )


-def build_parser() -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser(description="GitHub data pipeline CLI")
-    subparsers = parser.add_subparsers(dest="command", required=True)
+@click.group()
+def main() -> None:
+    """
+    GitHub data pipeline CLI.

-    sample_parser = subparsers.add_parser(
-        "sample-repos",
-        help="Collect repositories from GitHub Search and save phase 1 outputs.",
-    )
-    sample_parser.add_argument(
-        "--count",
-        type=int,
-        default=GithubConfig.default_repo_count,
-        help=f"Number of repositories to sample. Defaults to {GithubConfig.default_repo_count}.",
-    )
-    sample_parser.add_argument(
-        "--query",
-        type=str,
-        default=None,
-        help="Optional raw GitHub repository search query. Replaces config defaults when provided.",
-    )
-    sample_parser.add_argument(
-        "--output-root",
-        type=Path,
-        default=Path(GithubConfig.default_output_root),
-        help=f"Directory where run outputs are stored. Defaults to `{GithubConfig.default_output_root}`.",
-    )
-    sample_parser.add_argument(
-        "--mode",
-        choices=("append-deduped", "fresh"),
-        default="append-deduped",
-        help="Whether to dedupe against the persisted seen-repo index or start a fresh phase-1 run.",
-    )
-    sample_parser.add_argument(
-        "--per-page",
-        type=int,
-        default=GithubConfig.default_per_page,
-        help=f"GitHub Search page size. Defaults to {GithubConfig.default_per_page}.",
-    )
-    sample_parser.add_argument(
-        "--run-id",
-        type=str,
-        default=None,
-        help="Optional run identifier. If omitted, a run id is generated automatically.",
-    )
-
-    commit_parser = subparsers.add_parser(
-        "fetch-commits",
-        help="Fetch commit history for repositories collected in phase 1.",
-    )
-    commit_input = commit_parser.add_mutually_exclusive_group(required=True)
-    commit_input.add_argument(
-        "--run-id",
-        type=str,
-        help="Run identifier whose phase 1 repository dataset should be consumed.",
-    )
-    commit_input.add_argument(
-        "--repos-file",
-        type=Path,
-        help="Path to a phase 1 repos.jsonl file.",
-    )
-    commit_parser.add_argument(
-        "--output-root",
-        type=Path,
-        default=Path(GithubConfig.default_output_root),
-        help=f"Directory where run outputs are stored. Defaults to `{GithubConfig.default_output_root}`.",
-    )
-    commit_parser.add_argument(
-        "--mode",
-        choices=("refresh", "resume"),
-        default="refresh",
-        help="Whether to fetch all repositories from scratch or skip those already marked complete.",
-    )
-    commit_parser.add_argument(
-        "--max-pages-per-repo",
-        type=int,
-        default=GithubConfig.default_max_pages_per_repo,
-        help=(
-            "Maximum number of commit pages to fetch per repository. "
-            f"Defaults to {GithubConfig.default_max_pages_per_repo}."
-        ),
-    )
-    commit_parser.add_argument(
-        "--per-page",
-        type=int,
-        default=GithubConfig.default_per_page,
-        help=f"GitHub commit page size. Defaults to {GithubConfig.default_per_page}.",
-    )
-    commit_parser.add_argument(
-        "--retry-count",
-        type=int,
-        default=GithubConfig.default_retry_count,
-        help=f"Number of retries for repository metadata and commit requests. Defaults to {GithubConfig.default_retry_count}.",
-    )
-
-    return parser
+    Use the subcommands to run different phases of the data extraction process.
+    """
+    pass


-def main() -> int:
-    parser = build_parser()
-    args = parser.parse_args()

-    if args.command == "sample-repos":
-        options = SampleReposOptions(
-            count=args.count,
-            output_root=args.output_root,
-            query=resolve_query(args.query),
-            per_page=args.per_page,
-            mode=args.mode,
-            run_id=args.run_id,
-        )
-        result = sample_repositories(options)
-        print(f"Run ID: {result['run_id']}")
-        print(f"Collected repositories: {result['count_collected']}")
-        print(f"Repositories file: {result['repos_path']}")
-        print(f"Manifest file: {result['manifest_path']}")
-        if result["seen_index_path"] is not None:
-            print(f"Seen repo index: {result['seen_index_path']}")
-        return 0
+"""

-    if args.command == "fetch-commits":
-        options = FetchCommitsOptions(
-            output_root=args.output_root,
-            run_id=args.run_id,
-            repos_file=args.repos_file,
-            mode=args.mode,
-            per_page=args.per_page,
-            max_pages_per_repo=args.max_pages_per_repo,
-            retry_count=args.retry_count,
-        )
-        result = fetch_commits(options)
-        print(f"Run ID: {result['run_id']}")
-        print(f"Processed repositories: {result['processed_repositories']}")
-        print(f"Completed repositories: {result['completed_repositories']}")
-        print(f"Warning repositories: {result['warning_repositories']}")
-        print(f"Failed repositories: {result['failed_repositories']}")
-        print(f"Commits file: {result['commits_path']}")
-        print(f"Status file: {result['status_path']}")
-        print(f"Manifest file: {result['manifest_path']}")
-        return 0
+"""
+@main.command(name="sample-repos")
+@click.option(
+    "--count",
+    type=int,
+    default=GithubConfig.default_repo_count,
+    help=f"Number of repositories to sample. Defaults to {GithubConfig.default_repo_count}.",
+)
+@click.option(
+    "--query",
+    type=str,
+    default=GithubConfig.default_query,
+    help="Optional raw GitHub repository search query. Replaces config defaults when provided.",
+)
+@click.option(
+    "--output-root",
+    type=click.Path(path_type=Path),
+    default=Path(GithubConfig.default_output_root),
+    help=f"Directory where run outputs are stored. Defaults to `{GithubConfig.default_output_root}`.",
+)
+@click.option(
+    "--mode",
+    type=click.Choice(["append-deduped", "fresh"]),
+    default="fresh",
+    help="Whether to dedupe against the persisted seen-repo index or start a fresh phase-1 run.",
+)
+@click.option(
+    "--per-page",
+    type=int,
+    default=GithubConfig.default_per_page,
+    help=f"GitHub Search page size. Defaults to {GithubConfig.default_per_page}.",
+)
+@click.option(
+    "--run-id",
+    type=str,
+    default=None,
+    help="Optional run identifier. If omitted, a run id is generated automatically.",
+)
+def sample_repos(
+    count: int,
+    query: str | None,
+    output_root: Path,
+    mode: str,
+    per_page: int,
+    run_id: str | None,
+) -> None:
+    """
+    Collect repositories from GitHub Search and save phase 1 outputs.

-    parser.error(f"Unsupported command: {args.command}")
-    return 1
+    This command searches for repositories matching criteria and saves their metadata
+    to a JSONL file for further processing.
+    """
+    options = SampleReposOptions(
+        count=count,
+        output_root=output_root,
+        query=resolve_query(query),
+        per_page=per_page,
+        mode=mode,
+        run_id=run_id,
+    )
+    result = sample_repositories(options)
+    click.echo(f"Run ID: {result['run_id']}")
+    click.echo(f"Collected repositories: {result['count_collected']}")
+    click.echo(f"Repositories file: {result['repos_path']}")
+    click.echo(f"Manifest file: {result['manifest_path']}")
+    if result["seen_index_path"] is not None:
+        click.echo(f"Seen repo index: {result['seen_index_path']}")
+
+
+@main.command(name="fetch-commits")
+@click.option(
+    "--run-id",
+    type=str,
+    required = True,
+    help="Run identifier whose phase 1 repository dataset should be consumed.",
+)
+@click.option(
+    "--repos-file",
+    type=click.Path(exists=True, path_type=Path),
+    help="Path to a phase 1 repos.jsonl file.",
+)
+@click.option(
+    "--output-root",
+    type=click.Path(path_type=Path),
+    default=Path(GithubConfig.default_output_root),
+    help=f"Directory where run outputs are stored. Defaults to `{GithubConfig.default_output_root}`.",
+)
+@click.option(
+    "--mode",
+    type=click.Choice(["refresh", "resume"]),
+    default="refresh",
+    help="Whether to fetch all repositories from scratch or skip those already marked complete.",
+)
+@click.option(
+    "--max-pages-per-repo",
+    type=int,
+    default=GithubConfig.default_max_pages_per_repo,
+    help=(
+        "Maximum number of commit pages to fetch per repository. "
+        f"Defaults to {GithubConfig.default_max_pages_per_repo}."
+    ),
+)
+@click.option(
+    "--per-page",
+    type=int,
+    default=GithubConfig.default_per_page,
+    help=f"GitHub commit page size. Defaults to {GithubConfig.default_per_page}.",
+)
+@click.option(
+    "--retry-count",
+    type=int,
+    default=GithubConfig.default_retry_count,
+    help=f"Number of retries for repository metadata and commit requests. Defaults to {GithubConfig.default_retry_count}.",
+)
+def fetch_commits_cmd(
+    run_id: str | None,
+    repos_file: Path | None,
+    output_root: Path,
+    mode: str,
+    max_pages_per_repo: int,
+    per_page: int,
+    retry_count: int,
+) -> None:
+    """
+    Fetch commit history for repositories collected in phase 1.
+
+    This command reads a list of repositories and downloads their commit history 
+    using the GitHub API.
+    """
+    if not run_id and not repos_file:
+        raise click.UsageError("Must provide either --run-id or --repos-file")
+    if run_id and repos_file:
+        raise click.UsageError("Cannot provide both --run-id and --repos-file")
+
+    options = FetchCommitsOptions(
+        output_root=output_root,
+        run_id=run_id,
+        repos_file=repos_file,
+        mode=mode,
+        per_page=per_page,
+        max_pages_per_repo=max_pages_per_repo,
+        retry_count=retry_count,
+    )
+    result = fetch_commits(options)
+    click.echo(f"Run ID: {result['run_id']}")
+    click.echo(f"Processed repositories: {result['processed_repositories']}")
+    click.echo(f"Completed repositories: {result['completed_repositories']}")
+    click.echo(f"Warning repositories: {result['warning_repositories']}")
+    click.echo(f"Failed repositories: {result['failed_repositories']}")
+    click.echo(f"Commits file: {result['commits_path']}")
+    click.echo(f"Status file: {result['status_path']}")
+    click.echo(f"Manifest file: {result['manifest_path']}")


 if __name__ == "__main__":
-    raise SystemExit(main())
+    main()
--- a/src/github_datapipe/core/config.py
+++ b/src/github_datapipe/core/config.py
@@ -1,3 +1,10 @@
+"""
+Configuration settings for the GitHub data pipeline.
+
+This module defines the GithubConfig dataclass, which holds default values
+and endpoints for the GitHub API and the local execution environment.
+"""
+
 from __future__ import annotations

 from dataclasses import dataclass
@@ -6,6 +13,21 @@ from pathlib import Path

@dataclass(frozen=True)
 class GithubConfig:
+    """
+    Centralized configuration for the GitHub data pipeline.
+
+    Attributes:
+        base_url: The root URL for the GitHub API.
+        api_version: The version of the GitHub API to use.
+        search_repositories_endpoint: The endpoint for searching repositories.
+        default_repo_count: Default number of repositories to sample.
+        default_per_page: Default results per page for API requests.
+        default_output_root: Default root directory for output files.
+        default_query: Default search query for repository sampling.
+        default_max_pages_per_repo: Default maximum pages of commits to fetch per repository.
+        default_retry_count: Default number of retries for failed API requests.
+        user_agent: User agent string to send with GitHub API requests.
+    """
    base_url: str = "https://api.github.com"
    api_version: str = "2022-11-28"
    search_repositories_endpoint: str = "/search/repositories"
@@ -18,4 +40,5 @@ class GithubConfig:
    user_agent: str = "github-datapipe/0.1.0"


+# The root directory of the project, used for resolving relative paths.
 ROOT_DIR = Path(__file__).resolve().parents[2]
--- a/src/github_datapipe/core/github_api.py
+++ b/src/github_datapipe/core/github_api.py
@@ -1,3 +1,10 @@
+"""
+GitHub API client for the data pipeline.
+
+This module provides a wrapper around the GitHub REST API, handling authentication,
+headers, and common requests such as searching repositories and listing commits.
+"""
+
 from __future__ import annotations

 from typing import Any
@@ -12,7 +19,20 @@ class GithubApiError(RuntimeError):


 class GithubApiClient:
+    """
+    A client for interacting with the GitHub REST API.
+
+    Handles session management, authentication headers, and provides methods
+    for specific API endpoints.
+    """
    def __init__(self, token: str, session: requests.Session | None = None) -> None:
+        """
+        Initialize the client with a GitHub token.
+
+        Args:
+            token: GitHub Personal Access Token (PAT).
+            session: Optional pre-configured requests session.
+        """
        self._session = session or requests.Session()
        self._session.headers.update(
            {
@@ -24,21 +44,63 @@ class GithubApiClient:
        )

    def search_repositories(self, query: str, page: int, per_page: int) -> dict[str, Any]:
-        return self._get_json(
+        """
+        Search for repositories matching a query.
+
+        Args:
+            query: Raw GitHub search query string.
+            page: Page number to retrieve.
+            per_page: Number of items per page.
+
+        Returns:
+            The JSON response payload from GitHub.
+        """
+        return self._get_api_resp(
            GithubConfig.search_repositories_endpoint,
            params={"q": query, "page": page, "per_page": per_page},
        )

    def get_repository(self, full_name: str) -> dict[str, Any]:
-        return self._get_json(f"/repos/{full_name}")
+        """
+        Retrieve metadata for a single repository.
+
+        Args:
+            full_name: The full name of the repository (e.g., 'owner/repo').
+
+        Returns:
+            The JSON response payload from GitHub.
+        """
+        return self._get_api_resp(f"/repos/{full_name}")

    def list_commits(self, full_name: str, branch: str, page: int, per_page: int) -> list[dict[str, Any]]:
-        return self._get_json(
+        """
+        List commits for a specific repository and branch.
+
+        Args:
+            full_name: The full name of the repository.
+            branch: The branch name (or SHA) to list commits from.
+            page: Page number to retrieve.
+            per_page: Number of items per page.
+
+        Returns:
+            A list of commit data payloads.
+        """
+        return self._get_api_resp(
            f"/repos/{full_name}/commits",
            params={"sha": branch, "page": page, "per_page": per_page},
        )

-    def _get_json(self, path: str, params: dict[str, Any] | None = None) -> Any:
+    def _get_api_resp(self, path: str, params: dict[str, Any] | None = None) -> Any:
+        """
+        Perform a GET request and return the JSON response.
+
+        Args:
+            path: The API path (excluding the base URL).
+            params: Optional query parameters.
+
+        Raises:
+            GithubApiError: If the request fails or returns an error status code.
+        """
        response = self._session.get(
            f"{GithubConfig.base_url}{path}",
            params=params,
--- a/src/github_datapipe/core/io.py
+++ b/src/github_datapipe/core/io.py
@@ -1,3 +1,10 @@
+"""
+Input/Output utilities for the data pipeline.
+
+Provides helper functions for reading and writing JSON and JSONL (JSON Lines) files,
+including support for appending to JSONL files.
+"""
+
 from __future__ import annotations

 import json
@@ -6,11 +13,13 @@ from typing import Any, Iterable


 def write_json(path: Path, payload: Any) -> None:
+    """Write a Python object to a file as formatted JSON."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(payload, indent=2), encoding="utf-8")


 def write_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> None:
+    """Write an iterable of dictionaries to a file in JSONL format."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        for row in rows:
@@ -19,6 +28,12 @@ def write_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> None:


 def append_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> int:
+    """
+    Append an iterable of dictionaries to an existing JSONL file.
+
+    Returns:
+        The number of rows successfully appended.
+    """
    path.parent.mkdir(parents=True, exist_ok=True)
    count = 0
    with path.open("a", encoding="utf-8") as handle:
@@ -30,10 +45,16 @@ def append_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> int:


 def read_json(path: Path) -> Any:
+    """Read and parse a JSON file."""
    return json.loads(path.read_text(encoding="utf-8"))


 def read_jsonl(path: Path) -> list[dict[str, Any]]:
+    """
+    Read a JSONL file and return a list of dictionaries.
+
+    Returns an empty list if the file does not exist.
+    """
    if not path.exists():
        return []
    rows: list[dict[str, Any]] = []
--- a/src/github_datapipe/core/runtime.py
+++ b/src/github_datapipe/core/runtime.py
@@ -1,3 +1,10 @@
+"""
+Runtime environment and utility helpers.
+
+Handles loading environment variables (like GitHub tokens), generating
+unique run identifiers, and providing consistent UTC timestamps.
+"""
+
 from __future__ import annotations

 import os
@@ -10,6 +17,12 @@ load_dotenv()


 def require_github_token() -> str:
+    """
+    Ensure a GitHub token is available in the environment.
+
+    Raises:
+        ValueError: If the `github_token` environment variable is missing.
+    """
    github_token = os.getenv("github_token")
    if not github_token:
        raise ValueError("GitHub token is not available in .env as `github_token`.")
@@ -17,8 +30,10 @@ def require_github_token() -> str:


 def build_run_id() -> str:
+    """Generate a unique run ID based on the current timestamp and a random suffix."""
    return f"run-{datetime.now(tz=UTC).strftime('%Y%m%dT%H%M%SZ')}-{uuid4().hex[:8]}"


 def utc_now() -> str:
+    """Return the current UTC time in ISO 8601 format."""
    return datetime.now(tz=UTC).isoformat()
--- a/src/github_datapipe/phases/phase1_repository_sampling/service.py
+++ b/src/github_datapipe/phases/phase1_repository_sampling/service.py
@@ -1,3 +1,10 @@
+"""
+Phase 1: Repository Sampling Service.
+
+This service is responsible for searching GitHub for repositories based on a query,
+filtering out previously seen repositories (if requested), and saving the results.
+"""
+
 from __future__ import annotations

 from dataclasses import dataclass
@@ -12,6 +19,7 @@ from github_datapipe.core.runtime import build_run_id, require_github_token, utc

@dataclass(frozen=True)
 class SampleReposOptions:
+    """Options for the repository sampling phase."""
    count: int = GithubConfig.default_repo_count
    output_root: Path = Path(GithubConfig.default_output_root)
    query: str = GithubConfig.default_query
@@ -21,10 +29,23 @@ class SampleReposOptions:


 def resolve_query(query_override: str | None) -> str:
+    """Resolve the search query, using the override if provided, else the default."""
    return query_override.strip() if query_override else GithubConfig.default_query


 def sample_repositories(options: SampleReposOptions) -> dict[str, Any]:
+    """
+    Execute Phase 1: Sample repositories from GitHub.
+
+    Searches for repositories, handles pagination and deduplication, and saves
+    the collected repository data and a run manifest.
+
+    Args:
+        options: Configuration options for sampling.
+
+    Returns:
+        A dictionary containing the run ID and paths to generated files.
+    """
    token = require_github_token()
    if options.count <= 0:
        raise ValueError("`count` must be greater than 0.")
@@ -121,6 +142,7 @@ def normalize_repo_record(
    sample_page: int,
    sampled_at: str,
 ) -> dict[str, Any]:
+    """Normalize a raw GitHub repository payload into our internal schema."""
    return {
        "run_id": run_id,
        "repo_id": repo["id"],
@@ -142,6 +164,7 @@ def normalize_repo_record(


 def load_seen_repo_ids(path: Path) -> set[int]:
+    """Load the set of repository IDs that have already been sampled."""
    if not path.exists():
        return set()
    payload = read_json(path)
--- a/src/github_datapipe/phases/phase2_commit_ingestion/service.py
+++ b/src/github_datapipe/phases/phase2_commit_ingestion/service.py
@@ -1,3 +1,10 @@
+"""
+Phase 2: Commit Ingestion Service.
+
+This service reads a list of repositories (usually from Phase 1), and for each one,
+fetches its commit history from the GitHub API and saves it in JSONL format.
+"""
+
 from __future__ import annotations

 from dataclasses import dataclass
@@ -12,6 +19,7 @@ from github_datapipe.core.runtime import require_github_token, utc_now

@dataclass(frozen=True)
 class FetchCommitsOptions:
+    """Options for the commit ingestion phase."""
    output_root: Path = Path(GithubConfig.default_output_root)
    run_id: str | None = None
    repos_file: Path | None = None
@@ -22,6 +30,18 @@ class FetchCommitsOptions:


 def fetch_commits(options: FetchCommitsOptions) -> dict[str, Any]:
+    """
+    Execute Phase 2: Fetch commit history for multiple repositories.
+
+    Iterates through repositories, downloads commits, handles errors and retries,
+    and updates the run manifest and status records.
+
+    Args:
+        options: Configuration options for ingestion.
+
+    Returns:
+        A dictionary containing processing statistics and file paths.
+    """
    token = require_github_token()
    if options.max_pages_per_repo <= 0:
        raise ValueError("`max_pages_per_repo` must be greater than 0.")
@@ -123,6 +143,7 @@ def fetch_commits(options: FetchCommitsOptions) -> dict[str, Any]:


 def resolve_repo_input(options: FetchCommitsOptions, output_root: Path) -> tuple[Path, str]:
+    """Resolve the input repository list from either a run ID or a specific file path."""
    if options.run_id is not None:
        return (
            output_root / options.run_id / "phase1_repository_sampling" / "repos.jsonl",
@@ -145,6 +166,7 @@ def process_repository(
    max_pages_per_repo: int,
    retry_count: int,
 ) -> tuple[dict[str, Any], list[dict[str, Any]]]:
+    """Fetch all commits for a single repository, handling pagination and errors."""
    full_name = repo["full_name"]
    fetched_at = utc_now()

@@ -230,6 +252,7 @@ def normalize_commit_record(
    page_number: int,
    fetched_at: str,
 ) -> dict[str, Any]:
+    """Normalize a raw GitHub commit payload into our internal schema."""
    commit = commit_payload["commit"]
    author = commit.get("author") or {}
    committer = commit.get("committer") or {}
@@ -258,6 +281,7 @@ def normalize_commit_record(


 def with_retries(operation: Any, retry_count: int) -> Any:
+    """Execute a function with a specified number of retries on GitHubApiError."""
    last_error: Exception | None = None
    for _ in range(retry_count + 1):
        try:
@@ -269,6 +293,7 @@ def with_retries(operation: Any, retry_count: int) -> Any:


 def load_status_index(path: Path) -> dict[int, str]:
+    """Load an index of repository processing statuses to support resumption."""
    rows = read_jsonl(path)
    latest_by_repo: dict[int, str] = {}
    for row in rows:
@@ -277,6 +302,7 @@ def load_status_index(path: Path) -> dict[int, str]:


 def dedupe_status_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Deduplicate status records, keeping only the latest record for each repository."""
    latest_by_repo: dict[int, dict[str, Any]] = {}
    order: list[int] = []
    for row in rows:
@@ -288,6 +314,7 @@ def dedupe_status_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:


 def count_jsonl_rows(path: Path) -> int:
+    """Return the number of lines in a JSONL file."""
    if not path.exists():
        return 0
    return len(path.read_text(encoding="utf-8").splitlines())