diff --git a/README.md b/README.md index f1dc622..1153aa9 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,21 @@ From the project root, install the local environment with: poetry install ``` +## CLI Help + +To see the available commands and their descriptions, run: + +```powershell +poetry run github-datapipe --help +``` + +To see the available arguments and options for a specific command, use the `--help` flag with that command: + +```powershell +poetry run github-datapipe sample-repos --help +poetry run github-datapipe fetch-commits --help +``` + ## Run Phase 1 The default phase 1 command collects 10 repositories using the default search query from `src/github_datapipe/core/config.py`. diff --git a/poetry.lock b/poetry.lock index 7123254..cf5c8e8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -151,6 +151,21 @@ files = [ {file = "charset_normalizer-3.4.7.tar.gz", hash = "sha256:ae89db9e5f98a11a4bf50407d4363e7b09b31e55bc117b4f7d80aab97ba009e5"}, ] +[[package]] +name = "click" +version = "8.3.3" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "click-8.3.3-py3-none-any.whl", hash = "sha256:a2bf429bb3033c89fa4936ffb35d5cb471e3719e1f3c8a7c3fff0b8314305613"}, + {file = "click-8.3.3.tar.gz", hash = "sha256:398329ad4837b2ff7cbe1dd166a4c0f8900c3ca3a218de04466f38f6497f18a2"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + [[package]] name = "colorama" version = "0.4.6" @@ -158,7 +173,7 @@ description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" groups = ["main"] -markers = "sys_platform == \"win32\"" +markers = "sys_platform == \"win32\" or platform_system == \"Windows\"" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, @@ -606,4 +621,4 @@ zstd = ["backports-zstd (>=1.0.0) ; python_version < \"3.14\""] [metadata] lock-version = "2.1" python-versions = ">=3.12" -content-hash = "3c8765a0ae32ec30eec764e892bef0c0e87524d351f88387e91b78c4b604d112" +content-hash = "43a523cd40c7fc6a64072c377740862a41c1fd19a2be575974aa319838191e02" diff --git a/pyproject.toml b/pyproject.toml index 3911a03..b7fc39f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,8 @@ dependencies = [ "pyarrow (>=23.0.1,<24.0.0)", "python-dotenv (>=1.2.2,<2.0.0)", "pytest (>=9.0.3,<10.0.0)", - "requests-mock (>=1.12.1,<2.0.0)" + "requests-mock (>=1.12.1,<2.0.0)", + "click (>=8.3.3,<9.0.0)" ] [project.scripts] diff --git a/src/github_datapipe/cli.py b/src/github_datapipe/cli.py index 5f499c7..84d5c4f 100644 --- a/src/github_datapipe/cli.py +++ b/src/github_datapipe/cli.py @@ -1,6 +1,13 @@ +""" +Command-line interface for the GitHub data pipeline. + +This module provides the entry point for the CLI, allowing users to run different +phases of the pipeline, such as repository sampling and commit ingestion. 
+""" + from __future__ import annotations -import argparse +import click from pathlib import Path from github_datapipe.core.config import GithubConfig @@ -17,149 +24,172 @@ from github_datapipe.phases.phase2_commit_ingestion.service import ( ) -def build_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser(description="GitHub data pipeline CLI") - subparsers = parser.add_subparsers(dest="command", required=True) +@click.group() +def main() -> None: + """ + GitHub data pipeline CLI. - sample_parser = subparsers.add_parser( - "sample-repos", - help="Collect repositories from GitHub Search and save phase 1 outputs.", - ) - sample_parser.add_argument( - "--count", - type=int, - default=GithubConfig.default_repo_count, - help=f"Number of repositories to sample. Defaults to {GithubConfig.default_repo_count}.", - ) - sample_parser.add_argument( - "--query", - type=str, - default=None, - help="Optional raw GitHub repository search query. Replaces config defaults when provided.", - ) - sample_parser.add_argument( - "--output-root", - type=Path, - default=Path(GithubConfig.default_output_root), - help=f"Directory where run outputs are stored. Defaults to `{GithubConfig.default_output_root}`.", - ) - sample_parser.add_argument( - "--mode", - choices=("append-deduped", "fresh"), - default="append-deduped", - help="Whether to dedupe against the persisted seen-repo index or start a fresh phase-1 run.", - ) - sample_parser.add_argument( - "--per-page", - type=int, - default=GithubConfig.default_per_page, - help=f"GitHub Search page size. Defaults to {GithubConfig.default_per_page}.", - ) - sample_parser.add_argument( - "--run-id", - type=str, - default=None, - help="Optional run identifier. 
If omitted, a run id is generated automatically.", - ) - - commit_parser = subparsers.add_parser( - "fetch-commits", - help="Fetch commit history for repositories collected in phase 1.", - ) - commit_input = commit_parser.add_mutually_exclusive_group(required=True) - commit_input.add_argument( - "--run-id", - type=str, - help="Run identifier whose phase 1 repository dataset should be consumed.", - ) - commit_input.add_argument( - "--repos-file", - type=Path, - help="Path to a phase 1 repos.jsonl file.", - ) - commit_parser.add_argument( - "--output-root", - type=Path, - default=Path(GithubConfig.default_output_root), - help=f"Directory where run outputs are stored. Defaults to `{GithubConfig.default_output_root}`.", - ) - commit_parser.add_argument( - "--mode", - choices=("refresh", "resume"), - default="refresh", - help="Whether to fetch all repositories from scratch or skip those already marked complete.", - ) - commit_parser.add_argument( - "--max-pages-per-repo", - type=int, - default=GithubConfig.default_max_pages_per_repo, - help=( - "Maximum number of commit pages to fetch per repository. " - f"Defaults to {GithubConfig.default_max_pages_per_repo}." - ), - ) - commit_parser.add_argument( - "--per-page", - type=int, - default=GithubConfig.default_per_page, - help=f"GitHub commit page size. Defaults to {GithubConfig.default_per_page}.", - ) - commit_parser.add_argument( - "--retry-count", - type=int, - default=GithubConfig.default_retry_count, - help=f"Number of retries for repository metadata and commit requests. Defaults to {GithubConfig.default_retry_count}.", - ) - - return parser + Use the subcommands to run different phases of the data extraction process. 
+ """ + pass -def main() -> int: - parser = build_parser() - args = parser.parse_args() - if args.command == "sample-repos": - options = SampleReposOptions( - count=args.count, - output_root=args.output_root, - query=resolve_query(args.query), - per_page=args.per_page, - mode=args.mode, - run_id=args.run_id, - ) - result = sample_repositories(options) - print(f"Run ID: {result['run_id']}") - print(f"Collected repositories: {result['count_collected']}") - print(f"Repositories file: {result['repos_path']}") - print(f"Manifest file: {result['manifest_path']}") - if result["seen_index_path"] is not None: - print(f"Seen repo index: {result['seen_index_path']}") - return 0 +""" - if args.command == "fetch-commits": - options = FetchCommitsOptions( - output_root=args.output_root, - run_id=args.run_id, - repos_file=args.repos_file, - mode=args.mode, - per_page=args.per_page, - max_pages_per_repo=args.max_pages_per_repo, - retry_count=args.retry_count, - ) - result = fetch_commits(options) - print(f"Run ID: {result['run_id']}") - print(f"Processed repositories: {result['processed_repositories']}") - print(f"Completed repositories: {result['completed_repositories']}") - print(f"Warning repositories: {result['warning_repositories']}") - print(f"Failed repositories: {result['failed_repositories']}") - print(f"Commits file: {result['commits_path']}") - print(f"Status file: {result['status_path']}") - print(f"Manifest file: {result['manifest_path']}") - return 0 +""" +@main.command(name="sample-repos") +@click.option( + "--count", + type=int, + default=GithubConfig.default_repo_count, + help=f"Number of repositories to sample. Defaults to {GithubConfig.default_repo_count}.", +) +@click.option( + "--query", + type=str, + default=GithubConfig.default_query, + help="Optional raw GitHub repository search query. 
Replaces config defaults when provided.", +) +@click.option( + "--output-root", + type=click.Path(path_type=Path), + default=Path(GithubConfig.default_output_root), + help=f"Directory where run outputs are stored. Defaults to `{GithubConfig.default_output_root}`.", +) +@click.option( + "--mode", + type=click.Choice(["append-deduped", "fresh"]), + default="fresh", + help="Whether to dedupe against the persisted seen-repo index or start a fresh phase-1 run.", +) +@click.option( + "--per-page", + type=int, + default=GithubConfig.default_per_page, + help=f"GitHub Search page size. Defaults to {GithubConfig.default_per_page}.", +) +@click.option( + "--run-id", + type=str, + default=None, + help="Optional run identifier. If omitted, a run id is generated automatically.", +) +def sample_repos( + count: int, + query: str | None, + output_root: Path, + mode: str, + per_page: int, + run_id: str | None, +) -> None: + """ + Collect repositories from GitHub Search and save phase 1 outputs. - parser.error(f"Unsupported command: {args.command}") - return 1 + This command searches for repositories matching criteria and saves their metadata + to a JSONL file for further processing. 
+ """ + options = SampleReposOptions( + count=count, + output_root=output_root, + query=resolve_query(query), + per_page=per_page, + mode=mode, + run_id=run_id, + ) + result = sample_repositories(options) + click.echo(f"Run ID: {result['run_id']}") + click.echo(f"Collected repositories: {result['count_collected']}") + click.echo(f"Repositories file: {result['repos_path']}") + click.echo(f"Manifest file: {result['manifest_path']}") + if result["seen_index_path"] is not None: + click.echo(f"Seen repo index: {result['seen_index_path']}") + + +@main.command(name="fetch-commits") +@click.option( + "--run-id", + type=str, + default=None, + help="Run identifier whose phase 1 repository dataset should be consumed.", +) +@click.option( + "--repos-file", + type=click.Path(exists=True, path_type=Path), + help="Path to a phase 1 repos.jsonl file.", +) +@click.option( + "--output-root", + type=click.Path(path_type=Path), + default=Path(GithubConfig.default_output_root), + help=f"Directory where run outputs are stored. Defaults to `{GithubConfig.default_output_root}`.", +) +@click.option( + "--mode", + type=click.Choice(["refresh", "resume"]), + default="refresh", + help="Whether to fetch all repositories from scratch or skip those already marked complete.", +) +@click.option( + "--max-pages-per-repo", + type=int, + default=GithubConfig.default_max_pages_per_repo, + help=( + "Maximum number of commit pages to fetch per repository. " + f"Defaults to {GithubConfig.default_max_pages_per_repo}." + ), +) +@click.option( + "--per-page", + type=int, + default=GithubConfig.default_per_page, + help=f"GitHub commit page size. Defaults to {GithubConfig.default_per_page}.", +) +@click.option( + "--retry-count", + type=int, + default=GithubConfig.default_retry_count, + help=f"Number of retries for repository metadata and commit requests. 
Defaults to {GithubConfig.default_retry_count}.", +) +def fetch_commits_cmd( + run_id: str | None, + repos_file: Path | None, + output_root: Path, + mode: str, + max_pages_per_repo: int, + per_page: int, + retry_count: int, +) -> None: + """ + Fetch commit history for repositories collected in phase 1. + + This command reads a list of repositories and downloads their commit history + using the GitHub API. + """ + if not run_id and not repos_file: + raise click.UsageError("Must provide either --run-id or --repos-file") + if run_id and repos_file: + raise click.UsageError("Cannot provide both --run-id and --repos-file") + + options = FetchCommitsOptions( + output_root=output_root, + run_id=run_id, + repos_file=repos_file, + mode=mode, + per_page=per_page, + max_pages_per_repo=max_pages_per_repo, + retry_count=retry_count, + ) + result = fetch_commits(options) + click.echo(f"Run ID: {result['run_id']}") + click.echo(f"Processed repositories: {result['processed_repositories']}") + click.echo(f"Completed repositories: {result['completed_repositories']}") + click.echo(f"Warning repositories: {result['warning_repositories']}") + click.echo(f"Failed repositories: {result['failed_repositories']}") + click.echo(f"Commits file: {result['commits_path']}") + click.echo(f"Status file: {result['status_path']}") + click.echo(f"Manifest file: {result['manifest_path']}") if __name__ == "__main__": - raise SystemExit(main()) + main() diff --git a/src/github_datapipe/core/config.py b/src/github_datapipe/core/config.py index 0ca16d2..72ab1de 100644 --- a/src/github_datapipe/core/config.py +++ b/src/github_datapipe/core/config.py @@ -1,3 +1,10 @@ +""" +Configuration settings for the GitHub data pipeline. + +This module defines the GithubConfig dataclass, which holds default values +and endpoints for the GitHub API and the local execution environment. 
+""" + from __future__ import annotations from dataclasses import dataclass @@ -6,6 +13,21 @@ from pathlib import Path @dataclass(frozen=True) class GithubConfig: + """ + Centralized configuration for the GitHub data pipeline. + + Attributes: + base_url: The root URL for the GitHub API. + api_version: The version of the GitHub API to use. + search_repositories_endpoint: The endpoint for searching repositories. + default_repo_count: Default number of repositories to sample. + default_per_page: Default results per page for API requests. + default_output_root: Default root directory for output files. + default_query: Default search query for repository sampling. + default_max_pages_per_repo: Default maximum pages of commits to fetch per repository. + default_retry_count: Default number of retries for failed API requests. + user_agent: User agent string to send with GitHub API requests. + """ base_url: str = "https://api.github.com" api_version: str = "2022-11-28" search_repositories_endpoint: str = "/search/repositories" @@ -18,4 +40,5 @@ class GithubConfig: user_agent: str = "github-datapipe/0.1.0" +# The root directory of the project, used for resolving relative paths. ROOT_DIR = Path(__file__).resolve().parents[2] diff --git a/src/github_datapipe/core/github_api.py b/src/github_datapipe/core/github_api.py index 9f91df3..caf6ffe 100644 --- a/src/github_datapipe/core/github_api.py +++ b/src/github_datapipe/core/github_api.py @@ -1,3 +1,10 @@ +""" +GitHub API client for the data pipeline. + +This module provides a wrapper around the GitHub REST API, handling authentication, +headers, and common requests such as searching repositories and listing commits. +""" + from __future__ import annotations from typing import Any @@ -12,7 +19,20 @@ class GithubApiError(RuntimeError): class GithubApiClient: + """ + A client for interacting with the GitHub REST API. + + Handles session management, authentication headers, and provides methods + for specific API endpoints. 
+ """ def __init__(self, token: str, session: requests.Session | None = None) -> None: + """ + Initialize the client with a GitHub token. + + Args: + token: GitHub Personal Access Token (PAT). + session: Optional pre-configured requests session. + """ self._session = session or requests.Session() self._session.headers.update( { @@ -24,21 +44,63 @@ class GithubApiClient: ) def search_repositories(self, query: str, page: int, per_page: int) -> dict[str, Any]: - return self._get_json( + """ + Search for repositories matching a query. + + Args: + query: Raw GitHub search query string. + page: Page number to retrieve. + per_page: Number of items per page. + + Returns: + The JSON response payload from GitHub. + """ + return self._get_api_resp( GithubConfig.search_repositories_endpoint, params={"q": query, "page": page, "per_page": per_page}, ) def get_repository(self, full_name: str) -> dict[str, Any]: - return self._get_json(f"/repos/{full_name}") + """ + Retrieve metadata for a single repository. + + Args: + full_name: The full name of the repository (e.g., 'owner/repo'). + + Returns: + The JSON response payload from GitHub. + """ + return self._get_api_resp(f"/repos/{full_name}") def list_commits(self, full_name: str, branch: str, page: int, per_page: int) -> list[dict[str, Any]]: - return self._get_json( + """ + List commits for a specific repository and branch. + + Args: + full_name: The full name of the repository. + branch: The branch name (or SHA) to list commits from. + page: Page number to retrieve. + per_page: Number of items per page. + + Returns: + A list of commit data payloads. + """ + return self._get_api_resp( f"/repos/{full_name}/commits", params={"sha": branch, "page": page, "per_page": per_page}, ) - def _get_json(self, path: str, params: dict[str, Any] | None = None) -> Any: + def _get_api_resp(self, path: str, params: dict[str, Any] | None = None) -> Any: + """ + Perform a GET request and return the JSON response. 
+ + Args: + path: The API path (excluding the base URL). + params: Optional query parameters. + + Raises: + GithubApiError: If the request fails or returns an error status code. + """ response = self._session.get( f"{GithubConfig.base_url}{path}", params=params, diff --git a/src/github_datapipe/core/io.py b/src/github_datapipe/core/io.py index 55e6f6a..bf29f21 100644 --- a/src/github_datapipe/core/io.py +++ b/src/github_datapipe/core/io.py @@ -1,3 +1,10 @@ +""" +Input/Output utilities for the data pipeline. + +Provides helper functions for reading and writing JSON and JSONL (JSON Lines) files, +including support for appending to JSONL files. +""" + from __future__ import annotations import json @@ -6,11 +13,13 @@ from typing import Any, Iterable def write_json(path: Path, payload: Any) -> None: + """Write a Python object to a file as formatted JSON.""" path.parent.mkdir(parents=True, exist_ok=True) path.write_text(json.dumps(payload, indent=2), encoding="utf-8") def write_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> None: + """Write an iterable of dictionaries to a file in JSONL format.""" path.parent.mkdir(parents=True, exist_ok=True) with path.open("w", encoding="utf-8") as handle: for row in rows: @@ -19,6 +28,12 @@ def write_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> None: def append_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> int: + """ + Append an iterable of dictionaries to a JSONL file, creating it if it does not exist. + + Returns: + The number of rows successfully appended. + """ path.parent.mkdir(parents=True, exist_ok=True) count = 0 with path.open("a", encoding="utf-8") as handle: @@ -30,10 +45,16 @@ def append_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> int: def read_json(path: Path) -> Any: + """Read and parse a JSON file.""" return json.loads(path.read_text(encoding="utf-8")) def read_jsonl(path: Path) -> list[dict[str, Any]]: + """ + Read a JSONL file and return a list of dictionaries. 
+ + Returns an empty list if the file does not exist. + """ if not path.exists(): return [] rows: list[dict[str, Any]] = [] diff --git a/src/github_datapipe/core/runtime.py b/src/github_datapipe/core/runtime.py index 4a33100..2a2fb25 100644 --- a/src/github_datapipe/core/runtime.py +++ b/src/github_datapipe/core/runtime.py @@ -1,3 +1,10 @@ +""" +Runtime environment and utility helpers. + +Handles loading environment variables (like GitHub tokens), generating +unique run identifiers, and providing consistent UTC timestamps. +""" + from __future__ import annotations import os @@ -10,6 +17,12 @@ load_dotenv() def require_github_token() -> str: + """ + Ensure a GitHub token is available in the environment. + + Raises: + ValueError: If the `github_token` environment variable is missing. + """ github_token = os.getenv("github_token") if not github_token: raise ValueError("GitHub token is not available in .env as `github_token`.") @@ -17,8 +30,10 @@ def require_github_token() -> str: def build_run_id() -> str: + """Generate a unique run ID based on the current timestamp and a random suffix.""" return f"run-{datetime.now(tz=UTC).strftime('%Y%m%dT%H%M%SZ')}-{uuid4().hex[:8]}" def utc_now() -> str: + """Return the current UTC time in ISO 8601 format.""" return datetime.now(tz=UTC).isoformat() diff --git a/src/github_datapipe/phases/phase1_repository_sampling/service.py b/src/github_datapipe/phases/phase1_repository_sampling/service.py index 8538ad2..9d6c9a6 100644 --- a/src/github_datapipe/phases/phase1_repository_sampling/service.py +++ b/src/github_datapipe/phases/phase1_repository_sampling/service.py @@ -1,3 +1,10 @@ +""" +Phase 1: Repository Sampling Service. + +This service is responsible for searching GitHub for repositories based on a query, +filtering out previously seen repositories (if requested), and saving the results. 
+""" + from __future__ import annotations from dataclasses import dataclass @@ -12,6 +19,7 @@ from github_datapipe.core.runtime import build_run_id, require_github_token, utc @dataclass(frozen=True) class SampleReposOptions: + """Options for the repository sampling phase.""" count: int = GithubConfig.default_repo_count output_root: Path = Path(GithubConfig.default_output_root) query: str = GithubConfig.default_query @@ -21,10 +29,23 @@ class SampleReposOptions: def resolve_query(query_override: str | None) -> str: + """Resolve the search query, using the override if provided, else the default.""" return query_override.strip() if query_override else GithubConfig.default_query def sample_repositories(options: SampleReposOptions) -> dict[str, Any]: + """ + Execute Phase 1: Sample repositories from GitHub. + + Searches for repositories, handles pagination and deduplication, and saves + the collected repository data and a run manifest. + + Args: + options: Configuration options for sampling. + + Returns: + A dictionary containing the run ID and paths to generated files. 
+ """ token = require_github_token() if options.count <= 0: raise ValueError("`count` must be greater than 0.") @@ -121,6 +142,7 @@ def normalize_repo_record( sample_page: int, sampled_at: str, ) -> dict[str, Any]: + """Normalize a raw GitHub repository payload into our internal schema.""" return { "run_id": run_id, "repo_id": repo["id"], @@ -142,6 +164,7 @@ def normalize_repo_record( def load_seen_repo_ids(path: Path) -> set[int]: + """Load the set of repository IDs that have already been sampled.""" if not path.exists(): return set() payload = read_json(path) diff --git a/src/github_datapipe/phases/phase2_commit_ingestion/service.py b/src/github_datapipe/phases/phase2_commit_ingestion/service.py index 420a5c5..d2c3cbe 100644 --- a/src/github_datapipe/phases/phase2_commit_ingestion/service.py +++ b/src/github_datapipe/phases/phase2_commit_ingestion/service.py @@ -1,3 +1,10 @@ +""" +Phase 2: Commit Ingestion Service. + +This service reads a list of repositories (usually from Phase 1), and for each one, +fetches its commit history from the GitHub API and saves it in JSONL format. +""" + from __future__ import annotations from dataclasses import dataclass @@ -12,6 +19,7 @@ from github_datapipe.core.runtime import require_github_token, utc_now @dataclass(frozen=True) class FetchCommitsOptions: + """Options for the commit ingestion phase.""" output_root: Path = Path(GithubConfig.default_output_root) run_id: str | None = None repos_file: Path | None = None @@ -22,6 +30,18 @@ class FetchCommitsOptions: def fetch_commits(options: FetchCommitsOptions) -> dict[str, Any]: + """ + Execute Phase 2: Fetch commit history for multiple repositories. + + Iterates through repositories, downloads commits, handles errors and retries, + and updates the run manifest and status records. + + Args: + options: Configuration options for ingestion. + + Returns: + A dictionary containing processing statistics and file paths. 
+ """ token = require_github_token() if options.max_pages_per_repo <= 0: raise ValueError("`max_pages_per_repo` must be greater than 0.") @@ -123,6 +143,7 @@ def fetch_commits(options: FetchCommitsOptions) -> dict[str, Any]: def resolve_repo_input(options: FetchCommitsOptions, output_root: Path) -> tuple[Path, str]: + """Resolve the input repository list from either a run ID or a specific file path.""" if options.run_id is not None: return ( output_root / options.run_id / "phase1_repository_sampling" / "repos.jsonl", @@ -145,6 +166,7 @@ def process_repository( max_pages_per_repo: int, retry_count: int, ) -> tuple[dict[str, Any], list[dict[str, Any]]]: + """Fetch all commits for a single repository, handling pagination and errors.""" full_name = repo["full_name"] fetched_at = utc_now() @@ -230,6 +252,7 @@ def normalize_commit_record( page_number: int, fetched_at: str, ) -> dict[str, Any]: + """Normalize a raw GitHub commit payload into our internal schema.""" commit = commit_payload["commit"] author = commit.get("author") or {} committer = commit.get("committer") or {} @@ -258,6 +281,7 @@ def normalize_commit_record( def with_retries(operation: Any, retry_count: int) -> Any: + """Execute a function with a specified number of retries on GithubApiError.""" last_error: Exception | None = None for _ in range(retry_count + 1): try: @@ -269,6 +293,7 @@ def with_retries(operation: Any, retry_count: int) -> Any: def load_status_index(path: Path) -> dict[int, str]: + """Load an index of repository processing statuses to support resumption.""" rows = read_jsonl(path) latest_by_repo: dict[int, str] = {} for row in rows: @@ -277,6 +302,7 @@ def load_status_index(path: Path) -> dict[int, str]: def dedupe_status_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Deduplicate status records, keeping only the latest record for each repository.""" latest_by_repo: dict[int, dict[str, Any]] = {} order: list[int] = [] for row in rows: @@ -288,6 +314,7 @@ def 
dedupe_status_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: def count_jsonl_rows(path: Path) -> int: + """Return the number of lines in a JSONL file.""" if not path.exists(): return 0 return len(path.read_text(encoding="utf-8").splitlines())