Refactored to scale cli with click library
This commit is contained in:
15
README.md
15
README.md
@@ -36,6 +36,21 @@ From the project root, install the local environment with:
|
||||
poetry install
|
||||
```
|
||||
|
||||
## CLI Help
|
||||
|
||||
To see the available commands and their descriptions, run:
|
||||
|
||||
```powershell
|
||||
poetry run github-datapipe --help
|
||||
```
|
||||
|
||||
To see the available arguments and options for a specific command, use the `--help` flag with that command:
|
||||
|
||||
```powershell
|
||||
poetry run github-datapipe sample-repos --help
|
||||
poetry run github-datapipe fetch-commits --help
|
||||
```
|
||||
|
||||
## Run Phase 1
|
||||
|
||||
The default phase 1 command collects 10 repositories using the default search query from `src/github_datapipe/core/config.py`.
|
||||
|
||||
19
poetry.lock
generated
19
poetry.lock
generated
@@ -151,6 +151,21 @@ files = [
|
||||
{file = "charset_normalizer-3.4.7.tar.gz", hash = "sha256:ae89db9e5f98a11a4bf50407d4363e7b09b31e55bc117b4f7d80aab97ba009e5"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "click"
|
||||
version = "8.3.3"
|
||||
description = "Composable command line interface toolkit"
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "click-8.3.3-py3-none-any.whl", hash = "sha256:a2bf429bb3033c89fa4936ffb35d5cb471e3719e1f3c8a7c3fff0b8314305613"},
|
||||
{file = "click-8.3.3.tar.gz", hash = "sha256:398329ad4837b2ff7cbe1dd166a4c0f8900c3ca3a218de04466f38f6497f18a2"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||
|
||||
[[package]]
|
||||
name = "colorama"
|
||||
version = "0.4.6"
|
||||
@@ -158,7 +173,7 @@ description = "Cross-platform colored terminal text."
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
||||
groups = ["main"]
|
||||
markers = "sys_platform == \"win32\""
|
||||
markers = "sys_platform == \"win32\" or platform_system == \"Windows\""
|
||||
files = [
|
||||
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
|
||||
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||
@@ -606,4 +621,4 @@ zstd = ["backports-zstd (>=1.0.0) ; python_version < \"3.14\""]
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = ">=3.12"
|
||||
content-hash = "3c8765a0ae32ec30eec764e892bef0c0e87524d351f88387e91b78c4b604d112"
|
||||
content-hash = "43a523cd40c7fc6a64072c377740862a41c1fd19a2be575974aa319838191e02"
|
||||
|
||||
@@ -12,7 +12,8 @@ dependencies = [
|
||||
"pyarrow (>=23.0.1,<24.0.0)",
|
||||
"python-dotenv (>=1.2.2,<2.0.0)",
|
||||
"pytest (>=9.0.3,<10.0.0)",
|
||||
"requests-mock (>=1.12.1,<2.0.0)"
|
||||
"requests-mock (>=1.12.1,<2.0.0)",
|
||||
"click (>=8.3.3,<9.0.0)"
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
||||
@@ -1,6 +1,13 @@
|
||||
"""
|
||||
Command-line interface for the GitHub data pipeline.
|
||||
|
||||
This module provides the entry point for the CLI, allowing users to run different
|
||||
phases of the pipeline, such as repository sampling and commit ingestion.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import click
|
||||
from pathlib import Path
|
||||
|
||||
from github_datapipe.core.config import GithubConfig
|
||||
@@ -17,149 +24,172 @@ from github_datapipe.phases.phase2_commit_ingestion.service import (
|
||||
)
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
|
||||
parser = argparse.ArgumentParser(description="GitHub data pipeline CLI")
|
||||
subparsers = parser.add_subparsers(dest="command", required=True)
|
||||
@click.group()
|
||||
def main() -> None:
|
||||
"""
|
||||
GitHub data pipeline CLI.
|
||||
|
||||
sample_parser = subparsers.add_parser(
|
||||
"sample-repos",
|
||||
help="Collect repositories from GitHub Search and save phase 1 outputs.",
|
||||
)
|
||||
sample_parser.add_argument(
|
||||
"--count",
|
||||
type=int,
|
||||
default=GithubConfig.default_repo_count,
|
||||
help=f"Number of repositories to sample. Defaults to {GithubConfig.default_repo_count}.",
|
||||
)
|
||||
sample_parser.add_argument(
|
||||
"--query",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Optional raw GitHub repository search query. Replaces config defaults when provided.",
|
||||
)
|
||||
sample_parser.add_argument(
|
||||
"--output-root",
|
||||
type=Path,
|
||||
default=Path(GithubConfig.default_output_root),
|
||||
help=f"Directory where run outputs are stored. Defaults to `{GithubConfig.default_output_root}`.",
|
||||
)
|
||||
sample_parser.add_argument(
|
||||
"--mode",
|
||||
choices=("append-deduped", "fresh"),
|
||||
default="append-deduped",
|
||||
help="Whether to dedupe against the persisted seen-repo index or start a fresh phase-1 run.",
|
||||
)
|
||||
sample_parser.add_argument(
|
||||
"--per-page",
|
||||
type=int,
|
||||
default=GithubConfig.default_per_page,
|
||||
help=f"GitHub Search page size. Defaults to {GithubConfig.default_per_page}.",
|
||||
)
|
||||
sample_parser.add_argument(
|
||||
"--run-id",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Optional run identifier. If omitted, a run id is generated automatically.",
|
||||
)
|
||||
|
||||
commit_parser = subparsers.add_parser(
|
||||
"fetch-commits",
|
||||
help="Fetch commit history for repositories collected in phase 1.",
|
||||
)
|
||||
commit_input = commit_parser.add_mutually_exclusive_group(required=True)
|
||||
commit_input.add_argument(
|
||||
"--run-id",
|
||||
type=str,
|
||||
help="Run identifier whose phase 1 repository dataset should be consumed.",
|
||||
)
|
||||
commit_input.add_argument(
|
||||
"--repos-file",
|
||||
type=Path,
|
||||
help="Path to a phase 1 repos.jsonl file.",
|
||||
)
|
||||
commit_parser.add_argument(
|
||||
"--output-root",
|
||||
type=Path,
|
||||
default=Path(GithubConfig.default_output_root),
|
||||
help=f"Directory where run outputs are stored. Defaults to `{GithubConfig.default_output_root}`.",
|
||||
)
|
||||
commit_parser.add_argument(
|
||||
"--mode",
|
||||
choices=("refresh", "resume"),
|
||||
default="refresh",
|
||||
help="Whether to fetch all repositories from scratch or skip those already marked complete.",
|
||||
)
|
||||
commit_parser.add_argument(
|
||||
"--max-pages-per-repo",
|
||||
type=int,
|
||||
default=GithubConfig.default_max_pages_per_repo,
|
||||
help=(
|
||||
"Maximum number of commit pages to fetch per repository. "
|
||||
f"Defaults to {GithubConfig.default_max_pages_per_repo}."
|
||||
),
|
||||
)
|
||||
commit_parser.add_argument(
|
||||
"--per-page",
|
||||
type=int,
|
||||
default=GithubConfig.default_per_page,
|
||||
help=f"GitHub commit page size. Defaults to {GithubConfig.default_per_page}.",
|
||||
)
|
||||
commit_parser.add_argument(
|
||||
"--retry-count",
|
||||
type=int,
|
||||
default=GithubConfig.default_retry_count,
|
||||
help=f"Number of retries for repository metadata and commit requests. Defaults to {GithubConfig.default_retry_count}.",
|
||||
)
|
||||
|
||||
return parser
|
||||
Use the subcommands to run different phases of the data extraction process.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = build_parser()
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.command == "sample-repos":
|
||||
options = SampleReposOptions(
|
||||
count=args.count,
|
||||
output_root=args.output_root,
|
||||
query=resolve_query(args.query),
|
||||
per_page=args.per_page,
|
||||
mode=args.mode,
|
||||
run_id=args.run_id,
|
||||
)
|
||||
result = sample_repositories(options)
|
||||
print(f"Run ID: {result['run_id']}")
|
||||
print(f"Collected repositories: {result['count_collected']}")
|
||||
print(f"Repositories file: {result['repos_path']}")
|
||||
print(f"Manifest file: {result['manifest_path']}")
|
||||
if result["seen_index_path"] is not None:
|
||||
print(f"Seen repo index: {result['seen_index_path']}")
|
||||
return 0
|
||||
"""
|
||||
|
||||
if args.command == "fetch-commits":
|
||||
options = FetchCommitsOptions(
|
||||
output_root=args.output_root,
|
||||
run_id=args.run_id,
|
||||
repos_file=args.repos_file,
|
||||
mode=args.mode,
|
||||
per_page=args.per_page,
|
||||
max_pages_per_repo=args.max_pages_per_repo,
|
||||
retry_count=args.retry_count,
|
||||
)
|
||||
result = fetch_commits(options)
|
||||
print(f"Run ID: {result['run_id']}")
|
||||
print(f"Processed repositories: {result['processed_repositories']}")
|
||||
print(f"Completed repositories: {result['completed_repositories']}")
|
||||
print(f"Warning repositories: {result['warning_repositories']}")
|
||||
print(f"Failed repositories: {result['failed_repositories']}")
|
||||
print(f"Commits file: {result['commits_path']}")
|
||||
print(f"Status file: {result['status_path']}")
|
||||
print(f"Manifest file: {result['manifest_path']}")
|
||||
return 0
|
||||
"""
|
||||
@main.command(name="sample-repos")
|
||||
@click.option(
|
||||
"--count",
|
||||
type=int,
|
||||
default=GithubConfig.default_repo_count,
|
||||
help=f"Number of repositories to sample. Defaults to {GithubConfig.default_repo_count}.",
|
||||
)
|
||||
@click.option(
|
||||
"--query",
|
||||
type=str,
|
||||
default=GithubConfig.default_query,
|
||||
help="Optional raw GitHub repository search query. Replaces config defaults when provided.",
|
||||
)
|
||||
@click.option(
|
||||
"--output-root",
|
||||
type=click.Path(path_type=Path),
|
||||
default=Path(GithubConfig.default_output_root),
|
||||
help=f"Directory where run outputs are stored. Defaults to `{GithubConfig.default_output_root}`.",
|
||||
)
|
||||
@click.option(
|
||||
"--mode",
|
||||
type=click.Choice(["append-deduped", "fresh"]),
|
||||
default="fresh",
|
||||
help="Whether to dedupe against the persisted seen-repo index or start a fresh phase-1 run.",
|
||||
)
|
||||
@click.option(
|
||||
"--per-page",
|
||||
type=int,
|
||||
default=GithubConfig.default_per_page,
|
||||
help=f"GitHub Search page size. Defaults to {GithubConfig.default_per_page}.",
|
||||
)
|
||||
@click.option(
|
||||
"--run-id",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Optional run identifier. If omitted, a run id is generated automatically.",
|
||||
)
|
||||
def sample_repos(
|
||||
count: int,
|
||||
query: str | None,
|
||||
output_root: Path,
|
||||
mode: str,
|
||||
per_page: int,
|
||||
run_id: str | None,
|
||||
) -> None:
|
||||
"""
|
||||
Collect repositories from GitHub Search and save phase 1 outputs.
|
||||
|
||||
parser.error(f"Unsupported command: {args.command}")
|
||||
return 1
|
||||
This command searches for repositories matching criteria and saves their metadata
|
||||
to a JSONL file for further processing.
|
||||
"""
|
||||
options = SampleReposOptions(
|
||||
count=count,
|
||||
output_root=output_root,
|
||||
query=resolve_query(query),
|
||||
per_page=per_page,
|
||||
mode=mode,
|
||||
run_id=run_id,
|
||||
)
|
||||
result = sample_repositories(options)
|
||||
click.echo(f"Run ID: {result['run_id']}")
|
||||
click.echo(f"Collected repositories: {result['count_collected']}")
|
||||
click.echo(f"Repositories file: {result['repos_path']}")
|
||||
click.echo(f"Manifest file: {result['manifest_path']}")
|
||||
if result["seen_index_path"] is not None:
|
||||
click.echo(f"Seen repo index: {result['seen_index_path']}")
|
||||
|
||||
|
||||
@main.command(name="fetch-commits")
|
||||
@click.option(
|
||||
"--run-id",
|
||||
type=str,
|
||||
required = True,
|
||||
help="Run identifier whose phase 1 repository dataset should be consumed.",
|
||||
)
|
||||
@click.option(
|
||||
"--repos-file",
|
||||
type=click.Path(exists=True, path_type=Path),
|
||||
help="Path to a phase 1 repos.jsonl file.",
|
||||
)
|
||||
@click.option(
|
||||
"--output-root",
|
||||
type=click.Path(path_type=Path),
|
||||
default=Path(GithubConfig.default_output_root),
|
||||
help=f"Directory where run outputs are stored. Defaults to `{GithubConfig.default_output_root}`.",
|
||||
)
|
||||
@click.option(
|
||||
"--mode",
|
||||
type=click.Choice(["refresh", "resume"]),
|
||||
default="refresh",
|
||||
help="Whether to fetch all repositories from scratch or skip those already marked complete.",
|
||||
)
|
||||
@click.option(
|
||||
"--max-pages-per-repo",
|
||||
type=int,
|
||||
default=GithubConfig.default_max_pages_per_repo,
|
||||
help=(
|
||||
"Maximum number of commit pages to fetch per repository. "
|
||||
f"Defaults to {GithubConfig.default_max_pages_per_repo}."
|
||||
),
|
||||
)
|
||||
@click.option(
|
||||
"--per-page",
|
||||
type=int,
|
||||
default=GithubConfig.default_per_page,
|
||||
help=f"GitHub commit page size. Defaults to {GithubConfig.default_per_page}.",
|
||||
)
|
||||
@click.option(
|
||||
"--retry-count",
|
||||
type=int,
|
||||
default=GithubConfig.default_retry_count,
|
||||
help=f"Number of retries for repository metadata and commit requests. Defaults to {GithubConfig.default_retry_count}.",
|
||||
)
|
||||
def fetch_commits_cmd(
|
||||
run_id: str | None,
|
||||
repos_file: Path | None,
|
||||
output_root: Path,
|
||||
mode: str,
|
||||
max_pages_per_repo: int,
|
||||
per_page: int,
|
||||
retry_count: int,
|
||||
) -> None:
|
||||
"""
|
||||
Fetch commit history for repositories collected in phase 1.
|
||||
|
||||
This command reads a list of repositories and downloads their commit history
|
||||
using the GitHub API.
|
||||
"""
|
||||
if not run_id and not repos_file:
|
||||
raise click.UsageError("Must provide either --run-id or --repos-file")
|
||||
if run_id and repos_file:
|
||||
raise click.UsageError("Cannot provide both --run-id and --repos-file")
|
||||
|
||||
options = FetchCommitsOptions(
|
||||
output_root=output_root,
|
||||
run_id=run_id,
|
||||
repos_file=repos_file,
|
||||
mode=mode,
|
||||
per_page=per_page,
|
||||
max_pages_per_repo=max_pages_per_repo,
|
||||
retry_count=retry_count,
|
||||
)
|
||||
result = fetch_commits(options)
|
||||
click.echo(f"Run ID: {result['run_id']}")
|
||||
click.echo(f"Processed repositories: {result['processed_repositories']}")
|
||||
click.echo(f"Completed repositories: {result['completed_repositories']}")
|
||||
click.echo(f"Warning repositories: {result['warning_repositories']}")
|
||||
click.echo(f"Failed repositories: {result['failed_repositories']}")
|
||||
click.echo(f"Commits file: {result['commits_path']}")
|
||||
click.echo(f"Status file: {result['status_path']}")
|
||||
click.echo(f"Manifest file: {result['manifest_path']}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
main()
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
"""
|
||||
Configuration settings for the GitHub data pipeline.
|
||||
|
||||
This module defines the GithubConfig dataclass, which holds default values
|
||||
and endpoints for the GitHub API and the local execution environment.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
@@ -6,6 +13,21 @@ from pathlib import Path
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class GithubConfig:
|
||||
"""
|
||||
Centralized configuration for the GitHub data pipeline.
|
||||
|
||||
Attributes:
|
||||
base_url: The root URL for the GitHub API.
|
||||
api_version: The version of the GitHub API to use.
|
||||
search_repositories_endpoint: The endpoint for searching repositories.
|
||||
default_repo_count: Default number of repositories to sample.
|
||||
default_per_page: Default results per page for API requests.
|
||||
default_output_root: Default root directory for output files.
|
||||
default_query: Default search query for repository sampling.
|
||||
default_max_pages_per_repo: Default maximum pages of commits to fetch per repository.
|
||||
default_retry_count: Default number of retries for failed API requests.
|
||||
user_agent: User agent string to send with GitHub API requests.
|
||||
"""
|
||||
base_url: str = "https://api.github.com"
|
||||
api_version: str = "2022-11-28"
|
||||
search_repositories_endpoint: str = "/search/repositories"
|
||||
@@ -18,4 +40,5 @@ class GithubConfig:
|
||||
user_agent: str = "github-datapipe/0.1.0"
|
||||
|
||||
|
||||
# The root directory of the project, used for resolving relative paths.
|
||||
ROOT_DIR = Path(__file__).resolve().parents[2]
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
"""
|
||||
GitHub API client for the data pipeline.
|
||||
|
||||
This module provides a wrapper around the GitHub REST API, handling authentication,
|
||||
headers, and common requests such as searching repositories and listing commits.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
@@ -12,7 +19,20 @@ class GithubApiError(RuntimeError):
|
||||
|
||||
|
||||
class GithubApiClient:
|
||||
"""
|
||||
A client for interacting with the GitHub REST API.
|
||||
|
||||
Handles session management, authentication headers, and provides methods
|
||||
for specific API endpoints.
|
||||
"""
|
||||
def __init__(self, token: str, session: requests.Session | None = None) -> None:
|
||||
"""
|
||||
Initialize the client with a GitHub token.
|
||||
|
||||
Args:
|
||||
token: GitHub Personal Access Token (PAT).
|
||||
session: Optional pre-configured requests session.
|
||||
"""
|
||||
self._session = session or requests.Session()
|
||||
self._session.headers.update(
|
||||
{
|
||||
@@ -24,21 +44,63 @@ class GithubApiClient:
|
||||
)
|
||||
|
||||
def search_repositories(self, query: str, page: int, per_page: int) -> dict[str, Any]:
|
||||
return self._get_json(
|
||||
"""
|
||||
Search for repositories matching a query.
|
||||
|
||||
Args:
|
||||
query: Raw GitHub search query string.
|
||||
page: Page number to retrieve.
|
||||
per_page: Number of items per page.
|
||||
|
||||
Returns:
|
||||
The JSON response payload from GitHub.
|
||||
"""
|
||||
return self._get_api_resp(
|
||||
GithubConfig.search_repositories_endpoint,
|
||||
params={"q": query, "page": page, "per_page": per_page},
|
||||
)
|
||||
|
||||
def get_repository(self, full_name: str) -> dict[str, Any]:
|
||||
return self._get_json(f"/repos/{full_name}")
|
||||
"""
|
||||
Retrieve metadata for a single repository.
|
||||
|
||||
Args:
|
||||
full_name: The full name of the repository (e.g., 'owner/repo').
|
||||
|
||||
Returns:
|
||||
The JSON response payload from GitHub.
|
||||
"""
|
||||
return self._get_api_resp(f"/repos/{full_name}")
|
||||
|
||||
def list_commits(self, full_name: str, branch: str, page: int, per_page: int) -> list[dict[str, Any]]:
|
||||
return self._get_json(
|
||||
"""
|
||||
List commits for a specific repository and branch.
|
||||
|
||||
Args:
|
||||
full_name: The full name of the repository.
|
||||
branch: The branch name (or SHA) to list commits from.
|
||||
page: Page number to retrieve.
|
||||
per_page: Number of items per page.
|
||||
|
||||
Returns:
|
||||
A list of commit data payloads.
|
||||
"""
|
||||
return self._get_api_resp(
|
||||
f"/repos/{full_name}/commits",
|
||||
params={"sha": branch, "page": page, "per_page": per_page},
|
||||
)
|
||||
|
||||
def _get_json(self, path: str, params: dict[str, Any] | None = None) -> Any:
|
||||
def _get_api_resp(self, path: str, params: dict[str, Any] | None = None) -> Any:
|
||||
"""
|
||||
Perform a GET request and return the JSON response.
|
||||
|
||||
Args:
|
||||
path: The API path (excluding the base URL).
|
||||
params: Optional query parameters.
|
||||
|
||||
Raises:
|
||||
GithubApiError: If the request fails or returns an error status code.
|
||||
"""
|
||||
response = self._session.get(
|
||||
f"{GithubConfig.base_url}{path}",
|
||||
params=params,
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
"""
|
||||
Input/Output utilities for the data pipeline.
|
||||
|
||||
Provides helper functions for reading and writing JSON and JSONL (JSON Lines) files,
|
||||
including support for appending to JSONL files.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
@@ -6,11 +13,13 @@ from typing import Any, Iterable
|
||||
|
||||
|
||||
def write_json(path: Path, payload: Any) -> None:
|
||||
"""Write a Python object to a file as formatted JSON."""
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
|
||||
|
||||
|
||||
def write_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> None:
|
||||
"""Write an iterable of dictionaries to a file in JSONL format."""
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with path.open("w", encoding="utf-8") as handle:
|
||||
for row in rows:
|
||||
@@ -19,6 +28,12 @@ def write_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> None:
|
||||
|
||||
|
||||
def append_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> int:
|
||||
"""
|
||||
Append an iterable of dictionaries to an existing JSONL file.
|
||||
|
||||
Returns:
|
||||
The number of rows successfully appended.
|
||||
"""
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
count = 0
|
||||
with path.open("a", encoding="utf-8") as handle:
|
||||
@@ -30,10 +45,16 @@ def append_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> int:
|
||||
|
||||
|
||||
def read_json(path: Path) -> Any:
|
||||
"""Read and parse a JSON file."""
|
||||
return json.loads(path.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
def read_jsonl(path: Path) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Read a JSONL file and return a list of dictionaries.
|
||||
|
||||
Returns an empty list if the file does not exist.
|
||||
"""
|
||||
if not path.exists():
|
||||
return []
|
||||
rows: list[dict[str, Any]] = []
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
"""
|
||||
Runtime environment and utility helpers.
|
||||
|
||||
Handles loading environment variables (like GitHub tokens), generating
|
||||
unique run identifiers, and providing consistent UTC timestamps.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
@@ -10,6 +17,12 @@ load_dotenv()
|
||||
|
||||
|
||||
def require_github_token() -> str:
|
||||
"""
|
||||
Ensure a GitHub token is available in the environment.
|
||||
|
||||
Raises:
|
||||
ValueError: If the `github_token` environment variable is missing.
|
||||
"""
|
||||
github_token = os.getenv("github_token")
|
||||
if not github_token:
|
||||
raise ValueError("GitHub token is not available in .env as `github_token`.")
|
||||
@@ -17,8 +30,10 @@ def require_github_token() -> str:
|
||||
|
||||
|
||||
def build_run_id() -> str:
|
||||
"""Generate a unique run ID based on the current timestamp and a random suffix."""
|
||||
return f"run-{datetime.now(tz=UTC).strftime('%Y%m%dT%H%M%SZ')}-{uuid4().hex[:8]}"
|
||||
|
||||
|
||||
def utc_now() -> str:
|
||||
"""Return the current UTC time in ISO 8601 format."""
|
||||
return datetime.now(tz=UTC).isoformat()
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
"""
|
||||
Phase 1: Repository Sampling Service.
|
||||
|
||||
This service is responsible for searching GitHub for repositories based on a query,
|
||||
filtering out previously seen repositories (if requested), and saving the results.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
@@ -12,6 +19,7 @@ from github_datapipe.core.runtime import build_run_id, require_github_token, utc
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class SampleReposOptions:
|
||||
"""Options for the repository sampling phase."""
|
||||
count: int = GithubConfig.default_repo_count
|
||||
output_root: Path = Path(GithubConfig.default_output_root)
|
||||
query: str = GithubConfig.default_query
|
||||
@@ -21,10 +29,23 @@ class SampleReposOptions:
|
||||
|
||||
|
||||
def resolve_query(query_override: str | None) -> str:
|
||||
"""Resolve the search query, using the override if provided, else the default."""
|
||||
return query_override.strip() if query_override else GithubConfig.default_query
|
||||
|
||||
|
||||
def sample_repositories(options: SampleReposOptions) -> dict[str, Any]:
|
||||
"""
|
||||
Execute Phase 1: Sample repositories from GitHub.
|
||||
|
||||
Searches for repositories, handles pagination and deduplication, and saves
|
||||
the collected repository data and a run manifest.
|
||||
|
||||
Args:
|
||||
options: Configuration options for sampling.
|
||||
|
||||
Returns:
|
||||
A dictionary containing the run ID and paths to generated files.
|
||||
"""
|
||||
token = require_github_token()
|
||||
if options.count <= 0:
|
||||
raise ValueError("`count` must be greater than 0.")
|
||||
@@ -121,6 +142,7 @@ def normalize_repo_record(
|
||||
sample_page: int,
|
||||
sampled_at: str,
|
||||
) -> dict[str, Any]:
|
||||
"""Normalize a raw GitHub repository payload into our internal schema."""
|
||||
return {
|
||||
"run_id": run_id,
|
||||
"repo_id": repo["id"],
|
||||
@@ -142,6 +164,7 @@ def normalize_repo_record(
|
||||
|
||||
|
||||
def load_seen_repo_ids(path: Path) -> set[int]:
|
||||
"""Load the set of repository IDs that have already been sampled."""
|
||||
if not path.exists():
|
||||
return set()
|
||||
payload = read_json(path)
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
"""
|
||||
Phase 2: Commit Ingestion Service.
|
||||
|
||||
This service reads a list of repositories (usually from Phase 1), and for each one,
|
||||
fetches its commit history from the GitHub API and saves it in JSONL format.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
@@ -12,6 +19,7 @@ from github_datapipe.core.runtime import require_github_token, utc_now
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class FetchCommitsOptions:
|
||||
"""Options for the commit ingestion phase."""
|
||||
output_root: Path = Path(GithubConfig.default_output_root)
|
||||
run_id: str | None = None
|
||||
repos_file: Path | None = None
|
||||
@@ -22,6 +30,18 @@ class FetchCommitsOptions:
|
||||
|
||||
|
||||
def fetch_commits(options: FetchCommitsOptions) -> dict[str, Any]:
|
||||
"""
|
||||
Execute Phase 2: Fetch commit history for multiple repositories.
|
||||
|
||||
Iterates through repositories, downloads commits, handles errors and retries,
|
||||
and updates the run manifest and status records.
|
||||
|
||||
Args:
|
||||
options: Configuration options for ingestion.
|
||||
|
||||
Returns:
|
||||
A dictionary containing processing statistics and file paths.
|
||||
"""
|
||||
token = require_github_token()
|
||||
if options.max_pages_per_repo <= 0:
|
||||
raise ValueError("`max_pages_per_repo` must be greater than 0.")
|
||||
@@ -123,6 +143,7 @@ def fetch_commits(options: FetchCommitsOptions) -> dict[str, Any]:
|
||||
|
||||
|
||||
def resolve_repo_input(options: FetchCommitsOptions, output_root: Path) -> tuple[Path, str]:
|
||||
"""Resolve the input repository list from either a run ID or a specific file path."""
|
||||
if options.run_id is not None:
|
||||
return (
|
||||
output_root / options.run_id / "phase1_repository_sampling" / "repos.jsonl",
|
||||
@@ -145,6 +166,7 @@ def process_repository(
|
||||
max_pages_per_repo: int,
|
||||
retry_count: int,
|
||||
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
|
||||
"""Fetch all commits for a single repository, handling pagination and errors."""
|
||||
full_name = repo["full_name"]
|
||||
fetched_at = utc_now()
|
||||
|
||||
@@ -230,6 +252,7 @@ def normalize_commit_record(
|
||||
page_number: int,
|
||||
fetched_at: str,
|
||||
) -> dict[str, Any]:
|
||||
"""Normalize a raw GitHub commit payload into our internal schema."""
|
||||
commit = commit_payload["commit"]
|
||||
author = commit.get("author") or {}
|
||||
committer = commit.get("committer") or {}
|
||||
@@ -258,6 +281,7 @@ def normalize_commit_record(
|
||||
|
||||
|
||||
def with_retries(operation: Any, retry_count: int) -> Any:
|
||||
"""Execute a function with a specified number of retries on GitHubApiError."""
|
||||
last_error: Exception | None = None
|
||||
for _ in range(retry_count + 1):
|
||||
try:
|
||||
@@ -269,6 +293,7 @@ def with_retries(operation: Any, retry_count: int) -> Any:
|
||||
|
||||
|
||||
def load_status_index(path: Path) -> dict[int, str]:
|
||||
"""Load an index of repository processing statuses to support resumption."""
|
||||
rows = read_jsonl(path)
|
||||
latest_by_repo: dict[int, str] = {}
|
||||
for row in rows:
|
||||
@@ -277,6 +302,7 @@ def load_status_index(path: Path) -> dict[int, str]:
|
||||
|
||||
|
||||
def dedupe_status_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||
"""Deduplicate status records, keeping only the latest record for each repository."""
|
||||
latest_by_repo: dict[int, dict[str, Any]] = {}
|
||||
order: list[int] = []
|
||||
for row in rows:
|
||||
@@ -288,6 +314,7 @@ def dedupe_status_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||
|
||||
|
||||
def count_jsonl_rows(path: Path) -> int:
|
||||
"""Return the number of lines in a JSONL file."""
|
||||
if not path.exists():
|
||||
return 0
|
||||
return len(path.read_text(encoding="utf-8").splitlines())
|
||||
|
||||
Reference in New Issue
Block a user