From 8e09e33acbf0aded6ea0014c9f88f6723702e27d Mon Sep 17 00:00:00 2001 From: jc Date: Wed, 3 Dec 2025 20:18:30 -0500 Subject: [PATCH 1/5] improvements to error detection logic --- infra/helm/api-forge/values.yaml | 4 +- scripts/test_copier_generation.sh | 2 +- src/cli/deployment/helm_deployer/validator.py | 143 ++++++++++++++---- src/cli/deployment/shell_commands/kubectl.py | 18 ++- tests/unit/cli/deployment/test_validator.py | 102 ++++++++++++- 5 files changed, 226 insertions(+), 43 deletions(-) diff --git a/infra/helm/api-forge/values.yaml b/infra/helm/api-forge/values.yaml index 8ce2624..191c967 100644 --- a/infra/helm/api-forge/values.yaml +++ b/infra/helm/api-forge/values.yaml @@ -80,7 +80,7 @@ postgres: # Redis Cache/Sessions # ============================================================================= redis: - enabled: false + enabled: true image: repository: app_data_redis_image tag: latest @@ -122,7 +122,7 @@ redis: # Temporal Workflow Engine # ============================================================================= temporal: - enabled: false + enabled: true image: repository: my-temporal-server tag: latest diff --git a/scripts/test_copier_generation.sh b/scripts/test_copier_generation.sh index dc6321f..ee0ce46 100755 --- a/scripts/test_copier_generation.sh +++ b/scripts/test_copier_generation.sh @@ -67,7 +67,7 @@ log_info "Output directory: $TEST_OUTPUT_DIR" # Use --vcs-ref HEAD to use the current state (including uncommitted changes) # Use --data to override specific answers for testing optional features copier copy --trust --defaults --vcs-ref HEAD \ - --data use_temporal=true --data use_redis=true \ + --data use_temporal=false --data use_redis=false \ "$TEMPLATE_DIR" "$TEST_OUTPUT_DIR" if [ $? -eq 0 ]; then diff --git a/src/cli/deployment/helm_deployer/validator.py b/src/cli/deployment/helm_deployer/validator.py index e6eed31..d9040f8 100644 --- a/src/cli/deployment/helm_deployer/validator.py +++ b/src/cli/deployment/helm_deployer/validator.py @@ -182,11 +182,13 @@ def prompt_cleanup(self, result: ValidationResult, namespace: str) -> bool: "[yellow]Recommended: Run the following command to clean up:[/yellow]" ) self.console.print( - "[bold cyan] uv run api-forge-cli deploy down k8s --volumes[/bold cyan]\n" + "[bold cyan] uv run api-forge-cli deploy down k8s[/bold cyan]\n" ) self.console.print( - "[dim]This will delete all resources and persistent volumes, " - "allowing a fresh deployment.[/dim]\n" + "[dim]This will delete the Helm release and allow a fresh deployment.[/dim]" + ) + self.console.print( + "[dim]Add --volumes only if you need to wipe persistent data (databases, etc).[/dim]\n" ) # Prompt user @@ -203,11 +205,8 @@ def prompt_cleanup(self, result: ValidationResult, namespace: str) -> bool: self.console.print( "[bold yellow]Errors detected that may cause deployment issues.[/bold yellow]\n" ) - self.console.print("[yellow]Consider running cleanup first:[/yellow]") - self.console.print( - "[bold cyan] uv run api-forge-cli deploy down k8s --volumes[/bold cyan]\n" - ) + # Prompt user try: response = ( input("Proceed with deployment anyway? [y/N]: ").strip().lower() @@ -290,25 +289,46 @@ def _has_helm_release(self, namespace: str) -> bool: return any(r.name == self.constants.HELM_RELEASE_NAME for r in releases) def _check_failed_jobs(self, namespace: str, result: ValidationResult) -> None: - """Check for failed jobs in the namespace.""" + """Check for failed jobs in the namespace. + + Only flags jobs that have actually failed (exhausted retries with no + success). 
Jobs that are still running or have completed successfully + are not flagged, even if they had previous failed attempts. + + Failed jobs are flagged as warnings since Kubernetes will often retry + them, and they may succeed on subsequent attempts as dependencies + come online. + """ jobs = self.commands.kubectl.get_jobs(namespace) for job in jobs: - if job.get("status") == "Failed": + job_name = job["name"] + job_status = job.get("status") + + # If job succeeded, it's fine - ignore any previous failures + if job_status == "Complete": + continue + + # If job is still running, don't flag it + if job_status == "Running": + continue + + # Job failed - flag as warning (may be transient) + if job_status == "Failed": result.issues.append( ValidationIssue( - severity=ValidationSeverity.ERROR, - title=f"Failed job: {job['name']}", + severity=ValidationSeverity.WARNING, + title=f"Job has failures: {job_name}", description=( - f"Job '{job['name']}' failed. This may indicate " - "initialization or configuration problems." + f"Job '{job_name}' has failed attempts. This may be " + "transient during startup while dependencies initialize." ), recovery_hint=( - "Delete the failed job and redeploy, or run full cleanup " - "with 'deploy down k8s --volumes'" + f"Check logs: 'kubectl logs job/{job_name} -n {namespace}'. " + f"Delete job to retry: 'kubectl delete job {job_name} -n {namespace}'" ), resource_type="Job", - resource_name=job["name"], + resource_name=job_name, ) ) @@ -318,21 +338,22 @@ def _check_crashloop_pods(self, namespace: str, result: ValidationResult) -> Non for pod in pods: if pod.get("status") == "CrashLoopBackOff": + pod_name = str(pod["name"]) restarts = pod.get("restarts", 0) result.issues.append( ValidationIssue( severity=ValidationSeverity.ERROR, - title=f"Pod in CrashLoopBackOff: {pod['name']}", + title=f"Pod in CrashLoopBackOff: {pod_name}", description=( - f"Pod '{pod['name']}' is crash-looping ({restarts} restarts). " + f"Pod '{pod_name}' is crash-looping ({restarts} restarts). " "This usually indicates configuration or dependency issues." ), recovery_hint=( - "Check pod logs with 'kubectl logs {name} -n {namespace}', " + f"Check pod logs with 'kubectl logs {pod_name} -n {namespace}', " "then fix the issue or run cleanup" - ).format(name=pod["name"], namespace=namespace), + ), resource_type="Pod", - resource_name=pod["name"], + resource_name=pod_name, ) ) @@ -342,44 +363,102 @@ def _check_pending_pods(self, namespace: str, result: ValidationResult) -> None: for pod in pods: if pod.get("status") == "Pending": + pod_name = str(pod["name"]) # Check if it's been pending for a while (ignore recently created) # For now, treat all Pending as warnings result.issues.append( ValidationIssue( severity=ValidationSeverity.WARNING, - title=f"Pod pending: {pod['name']}", + title=f"Pod pending: {pod_name}", description=( - f"Pod '{pod['name']}' is stuck in Pending state. " + f"Pod '{pod_name}' is stuck in Pending state. " "This may indicate resource constraints or scheduling issues." ), recovery_hint=( - "Check events with 'kubectl describe pod {name} -n {namespace}'" - ).format(name=pod["name"], namespace=namespace), + f"Check events with 'kubectl describe pod {pod_name} -n {namespace}'" + ), resource_type="Pod", - resource_name=pod["name"], + resource_name=pod_name, ) ) def _check_error_pods(self, namespace: str, result: ValidationResult) -> None: - """Check for pods in Error state.""" + """Check for pods in Error state. + + For pods owned by Jobs, only considers the most recent pod per job. 
+ This avoids flagging old failed attempts when the job has since + succeeded or has a newer attempt in progress. + """ pods = self.commands.kubectl.get_pods(namespace) + # Group job-owned pods by their job name + job_pods: dict[str, list[dict[str, str | int]]] = {} + non_job_pods: list[dict[str, str | int]] = [] + for pod in pods: + job_owner = str(pod.get("jobOwner", "")) + if job_owner: + if job_owner not in job_pods: + job_pods[job_owner] = [] + job_pods[job_owner].append(pod) + else: + non_job_pods.append(pod) + + # Check non-job pods for errors (these are always relevant) + for pod in non_job_pods: if pod.get("status") == "Error": + pod_name = str(pod["name"]) result.issues.append( ValidationIssue( severity=ValidationSeverity.ERROR, - title=f"Pod in Error state: {pod['name']}", + title=f"Pod in Error state: {pod_name}", description=( - f"Pod '{pod['name']}' is in Error state. " + f"Pod '{pod_name}' is in Error state. " "Check logs to determine the cause." ), recovery_hint=( - "Check pod logs with 'kubectl logs {name} -n {namespace}', " + f"Check pod logs with 'kubectl logs {pod_name} -n {namespace}', " "then fix the issue or run cleanup" - ).format(name=pod["name"], namespace=namespace), + ), + resource_type="Pod", + resource_name=pod_name, + ) + ) + + # For job-owned pods, only check the most recent pod per job + for job_name, pods_list in job_pods.items(): + # Sort by creation timestamp (newest first) + # ISO 8601 timestamps sort correctly as strings + sorted_pods = sorted( + pods_list, + key=lambda p: str(p.get("creationTimestamp", "")), + reverse=True, + ) + + if not sorted_pods: + continue + + most_recent_pod = sorted_pods[0] + pod_status = most_recent_pod.get("status") + + # Only flag if the most recent pod is in Error state + # Completed/Succeeded pods are fine, older failed pods are irrelevant + if pod_status == "Error": + pod_name = str(most_recent_pod["name"]) + result.issues.append( + ValidationIssue( + severity=ValidationSeverity.WARNING, + title=f"Job pod in Error state: {pod_name}", + description=( + f"Most recent pod for job '{job_name}' is in Error state. " + "This may be transient if the job will retry." + ), + recovery_hint=( + f"Check logs: 'kubectl logs {pod_name} -n {namespace}'. " + f"Delete job to retry: 'kubectl delete job {job_name} -n {namespace}'" + ), resource_type="Pod", - resource_name=pod["name"], + resource_name=pod_name, ) ) diff --git a/src/cli/deployment/shell_commands/kubectl.py b/src/cli/deployment/shell_commands/kubectl.py index dbe21fd..575e3d1 100644 --- a/src/cli/deployment/shell_commands/kubectl.py +++ b/src/cli/deployment/shell_commands/kubectl.py @@ -486,14 +486,15 @@ def wait_for_pods( capture_output=False, ) - def get_pods(self, namespace: str) -> list[dict[str, str]]: + def get_pods(self, namespace: str) -> list[dict[str, str | int]]: """Get all pods in a namespace with their status. 
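+
+        Each item is a flat dict; an illustrative example (hypothetical pod
+        name and timestamp):
+
+            {"name": "api-abc12", "status": "Running", "restarts": 0,
+             "creationTimestamp": "2025-01-01T10:05:00Z", "jobOwner": ""}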
Args: namespace: Kubernetes namespace Returns: - List of dicts with pod name, status, and restarts + List of dicts with pod name, status, restarts, creation timestamp, + and job owner (if pod is owned by a Job) """ result = self._runner.run( ["kubectl", "get", "pods", "-n", namespace, "-o", "json"], @@ -507,9 +508,18 @@ def get_pods(self, namespace: str) -> list[dict[str, str]]: pods = [] for pod in data.get("items", []): - name = pod.get("metadata", {}).get("name", "") + metadata = pod.get("metadata", {}) + name = metadata.get("name", "") + creation_timestamp = metadata.get("creationTimestamp", "") status = pod.get("status", {}) + # Check if pod is owned by a Job + job_owner = "" + for owner_ref in metadata.get("ownerReferences", []): + if owner_ref.get("kind") == "Job": + job_owner = owner_ref.get("name", "") + break + # Determine pod status phase = status.get("phase", "Unknown") container_statuses = status.get("containerStatuses", []) @@ -535,6 +545,8 @@ def get_pods(self, namespace: str) -> list[dict[str, str]]: "name": name, "status": pod_status, "restarts": restarts, + "creationTimestamp": creation_timestamp, + "jobOwner": job_owner, } ) diff --git a/tests/unit/cli/deployment/test_validator.py b/tests/unit/cli/deployment/test_validator.py index b92608f..6dbf336 100644 --- a/tests/unit/cli/deployment/test_validator.py +++ b/tests/unit/cli/deployment/test_validator.py @@ -160,7 +160,11 @@ def test_validate_existing_namespace_no_issues( def test_validate_detects_failed_jobs( self, validator: DeploymentValidator, mock_commands: MagicMock ) -> None: - """Validation should detect failed Kubernetes jobs.""" + """Validation should detect failed Kubernetes jobs. + + Init jobs like postgres-verifier are expected to have transient failures + during startup, so they should be flagged as warnings, not errors. 
+ """ mock_commands.kubectl.namespace_exists.return_value = True mock_commands.helm.list_releases.return_value = [] mock_commands.kubectl.get_jobs.return_value = [ @@ -172,9 +176,30 @@ def test_validate_detects_failed_jobs( assert result.is_clean is False assert len(result.issues) == 1 - assert result.issues[0].severity == ValidationSeverity.ERROR + # Init jobs like postgres-verifier are expected to have transient failures + # during startup, so they should be WARNING not ERROR + assert result.issues[0].severity == ValidationSeverity.WARNING assert "postgres-verifier" in result.issues[0].title + def test_validate_any_failed_job_is_warning( + self, validator: DeploymentValidator, mock_commands: MagicMock + ) -> None: + """All failed jobs should be flagged as warnings (may be transient).""" + mock_commands.kubectl.namespace_exists.return_value = True + mock_commands.helm.list_releases.return_value = [] + mock_commands.kubectl.get_jobs.return_value = [ + {"name": "migration-job", "status": "Failed"}, + ] + mock_commands.kubectl.get_pods.return_value = [] + + result = validator.validate("api-forge-prod") + + assert result.is_clean is False + assert len(result.issues) == 1 + # All failed jobs are warnings since they may be transient + assert result.issues[0].severity == ValidationSeverity.WARNING + assert "migration-job" in result.issues[0].title + def test_validate_detects_crashloop_pods( self, validator: DeploymentValidator, mock_commands: MagicMock ) -> None: @@ -231,6 +256,72 @@ def test_validate_detects_error_pods( assert result.issues[0].severity == ValidationSeverity.ERROR assert "Error" in result.issues[0].title + def test_validate_job_pods_only_checks_most_recent( + self, validator: DeploymentValidator, mock_commands: MagicMock + ) -> None: + """For job-owned pods, only the most recent pod should be checked. + + If old pods from a job are in Error state but a newer pod succeeded, + we should not flag the old errors. 
+ """ + mock_commands.kubectl.namespace_exists.return_value = True + mock_commands.helm.list_releases.return_value = [] + mock_commands.kubectl.get_jobs.return_value = [] + mock_commands.kubectl.get_pods.return_value = [ + # Old pod from first attempt - failed + { + "name": "postgres-verifier-abc", + "status": "Error", + "jobOwner": "postgres-verifier", + "creationTimestamp": "2025-01-01T10:00:00Z", + }, + # Newer pod from second attempt - succeeded + { + "name": "postgres-verifier-def", + "status": "Succeeded", + "jobOwner": "postgres-verifier", + "creationTimestamp": "2025-01-01T10:05:00Z", + }, + ] + + result = validator.validate("api-forge-prod") + + # Should be clean - the most recent pod succeeded + assert result.is_clean is True + assert len(result.issues) == 0 + + def test_validate_job_pods_flags_if_most_recent_failed( + self, validator: DeploymentValidator, mock_commands: MagicMock + ) -> None: + """If the most recent job pod is in Error state, flag it as a warning.""" + mock_commands.kubectl.namespace_exists.return_value = True + mock_commands.helm.list_releases.return_value = [] + mock_commands.kubectl.get_jobs.return_value = [] + mock_commands.kubectl.get_pods.return_value = [ + # Old pod succeeded + { + "name": "postgres-verifier-abc", + "status": "Succeeded", + "jobOwner": "postgres-verifier", + "creationTimestamp": "2025-01-01T10:00:00Z", + }, + # Newer pod failed + { + "name": "postgres-verifier-def", + "status": "Error", + "jobOwner": "postgres-verifier", + "creationTimestamp": "2025-01-01T10:05:00Z", + }, + ] + + result = validator.validate("api-forge-prod") + + # Should flag the most recent failed pod as a warning + assert result.is_clean is False + assert len(result.issues) == 1 + assert result.issues[0].severity == ValidationSeverity.WARNING + assert "postgres-verifier-def" in result.issues[0].title + def test_validate_detects_multiple_issues( self, validator: DeploymentValidator, mock_commands: MagicMock ) -> None: @@ -249,10 +340,11 @@ def test_validate_detects_multiple_issues( assert result.is_clean is False assert len(result.issues) == 3 - # Should have 2 ERRORs (failed job + crashloop), 1 WARNING (pending) + # Should have 1 ERROR (crashloop), 2 WARNINGs (init job + pending) + # postgres-verifier is an init job so it's a WARNING, not ERROR severities = [issue.severity for issue in result.issues] - assert severities.count(ValidationSeverity.ERROR) == 2 - assert ValidationSeverity.WARNING in severities + assert severities.count(ValidationSeverity.ERROR) == 1 + assert severities.count(ValidationSeverity.WARNING) == 2 def test_display_results_clean( self, From 354bcca881d0c53ce480d2aebeb25e9fb7b90d10 Mon Sep 17 00:00:00 2001 From: jc Date: Wed, 3 Dec 2025 21:07:21 -0500 Subject: [PATCH 2/5] refactor(api): standardize health endpoint responses with Pydantic models - Add ServiceStatus enum (HEALTHY, UNHEALTHY, DEGRADED, DISABLED) for consistent terminology - Create HealthCheckService with dependency injection pattern - Refactor health router to use typed Pydantic response models - Add workflow schemas (WorkflowStartRequest/Response, etc.) 
- Improve workflows router with proper response_model annotations - Add comprehensive docstrings and OpenAPI documentation - Add unit tests for health endpoints (13 tests) and HealthCheckService (21 tests) - All 490 unit tests passing --- src/app/api/http/routers/health.py | 384 ++++++++---------- src/app/api/http/routers/workflows.py | 171 ++++++-- src/app/api/http/schemas/__init__.py | 61 +++ src/app/api/http/schemas/health.py | 237 +++++++++++ src/app/api/http/schemas/workflows.py | 128 ++++++ src/app/core/services/health_service.py | 350 ++++++++++++++++ .../api/http/routers/test_health_endpoints.py | 316 ++++++++++++++ .../app/core/services/test_health_service.py | 378 +++++++++++++++++ 8 files changed, 1783 insertions(+), 242 deletions(-) create mode 100644 src/app/api/http/schemas/health.py create mode 100644 src/app/api/http/schemas/workflows.py create mode 100644 src/app/core/services/health_service.py create mode 100644 tests/unit/app/api/http/routers/test_health_endpoints.py create mode 100644 tests/unit/app/core/services/test_health_service.py diff --git a/src/app/api/http/routers/health.py b/src/app/api/http/routers/health.py index f673986..2787ed9 100644 --- a/src/app/api/http/routers/health.py +++ b/src/app/api/http/routers/health.py @@ -1,259 +1,235 @@ -"""Health check endpoints router for monitoring service availability.""" +"""Health check endpoints router for monitoring service availability. -from typing import Any +This module provides HTTP endpoints for health checking the application +and its dependencies. These endpoints are used by: -from fastapi import APIRouter, Request +- Kubernetes liveness probes (/health) +- Kubernetes readiness probes (/health/ready) +- Monitoring systems +- Load balancers + +Endpoint Summary: + GET /health - Liveness probe (app is running) + GET /health/ready - Readiness probe (all services operational) + GET /health/database - Database-specific health check + GET /health/redis - Redis-specific health check + GET /health/temporal - Temporal-specific health check +""" + +from __future__ import annotations + +from fastapi import APIRouter, Depends, Request from starlette.responses import JSONResponse from src.app.api.http.app_data import ApplicationDependencies +from src.app.api.http.schemas.health import ( + DatabaseHealthDetailed, + HealthCheckError, + LivenessResponse, + OverallStatus, + ReadinessResponse, + RedisHealthDetailed, + ServiceStatus, + TemporalHealthDetailed, +) +from src.app.core.services.health_service import HealthCheckService from src.app.runtime.context import get_config router = APIRouter(prefix="/health", tags=["health"]) -@router.get("") -async def health() -> dict[str, str]: +# ============================================================================= +# Dependencies +# ============================================================================= + + +def get_health_service(request: Request) -> HealthCheckService: + """Get the health check service instance. + + This dependency creates a HealthCheckService with the application's + dependencies and configuration. 
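+
+    In tests this dependency can be replaced via FastAPI dependency overrides
+    (the health endpoint unit tests use this pattern); 'fake_health_service'
+    below is a hypothetical stand-in for a stubbed service:
+
+    ```python
+    app.dependency_overrides[get_health_service] = lambda: fake_health_service
+    ```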
+ """ + app_deps: ApplicationDependencies = request.app.state.app_dependencies + config = get_config() + return HealthCheckService(app_deps, config) + + +# ============================================================================= +# Liveness Probe +# ============================================================================= + + +@router.get( + "", + response_model=LivenessResponse, + summary="Liveness probe", + description="Basic health check - returns 200 if the application process is running.", +) +async def health() -> LivenessResponse: """Basic health check endpoint - checks if app is running. This is a liveness probe that returns 200 OK as long as the application process is running. It does not check dependencies. - """ - return {"status": "healthy", "service": "api"} - -@router.get("/ready", response_model=None) -async def readiness(request: Request) -> dict[str, Any] | JSONResponse: + Use this endpoint for Kubernetes liveness probes. + """ + return LivenessResponse() + + +# ============================================================================= +# Readiness Probe +# ============================================================================= + + +@router.get( + "/ready", + response_model=ReadinessResponse, + responses={ + 200: {"description": "All services are ready"}, + 503: { + "description": "One or more services are not ready", + "model": ReadinessResponse, + }, + }, + summary="Readiness probe", + description="Comprehensive readiness check - validates all service dependencies.", +) +async def readiness( + health_service: HealthCheckService = Depends(get_health_service), +) -> ReadinessResponse | JSONResponse: """Comprehensive readiness check - validates all service dependencies. - Returns 200 if all services are ready, 503 if any service is unavailable. + Returns 200 if all critical services are ready, 503 if any are unavailable. This checks: - - Database connectivity - - Redis (non-critical, falls back to in-memory) - - Temporal (if enabled) + - Database connectivity (critical) + - Redis connectivity (non-critical, falls back to in-memory) + - Temporal connectivity (critical if enabled) - OIDC providers (critical in production only) - """ - from pydantic import BaseModel - - # Simple model for health check test data - class HealthCheckData(BaseModel): - test: bool - app_deps: ApplicationDependencies = request.app.state.app_dependencies - config = get_config() - - checks: dict[str, Any] = {} - all_healthy = True - - # Database health check - try: - db_healthy = app_deps.database_service.health_check() - checks["database"] = { - "status": "healthy" if db_healthy else "unhealthy", - "type": "postgresql" if "postgresql" in config.database.url else "sqlite", - } - if not db_healthy: - all_healthy = False - except Exception as e: - checks["database"] = { - "status": "unhealthy", - "error": str(e), - } - all_healthy = False + Use this endpoint for Kubernetes readiness probes. 
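+
+    Illustrative 200 response body (example values; null fields omitted):
+
+        {
+            "status": "ready",
+            "environment": "development",
+            "checks": {
+                "database": {"status": "healthy", "type": "postgresql"},
+                "redis": {"status": "disabled", "type": "in-memory"},
+                "temporal": {"status": "disabled"}
+            }
+        }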
+ """ + result = await health_service.check_all() - # Redis health check - try: - if app_deps.redis_service: - redis_healthy = await app_deps.redis_service.health_check() - else: - redis_healthy = False # Redis service not configured - - checks["redis"] = { - "status": "healthy" - if redis_healthy - else "unhealthy" - if config.redis.enabled - else "degraded", - "type": "redis" if config.redis.enabled else "in-memory", - } - # Redis failure is not critical - we fall back to in-memory - # So we don't set all_healthy = False here - except Exception as e: - checks["redis"] = { - "status": "degraded", - "type": "in-memory", - "note": "Using in-memory storage", - "error": str(e) if config.redis.enabled else "Redis not configured", - } - # Redis failure is not critical - we fall back to in-memory - - # Temporal health check - if config.temporal.enabled: - try: - temporal_healthy = await app_deps.temporal_service.health_check() - checks["temporal"] = { - "status": "healthy" if temporal_healthy else "unhealthy", - "url": app_deps.temporal_service.url, - "namespace": app_deps.temporal_service.namespace, - } - if not temporal_healthy: - all_healthy = False - except Exception as e: - checks["temporal"] = { - "status": "unhealthy", - "error": str(e), - } - all_healthy = False - else: - checks["temporal"] = { - "status": "disabled", - "note": "Temporal service is not enabled", - } - - # OIDC providers check (verify JWKS endpoints are reachable) - oidc_checks = {} - if config.oidc.providers: - jwks_service = app_deps.jwks_service - for provider_name, provider_config in config.oidc.providers.items(): - try: - # Try to fetch JWKS to verify provider is reachable - _ = await jwks_service.fetch_jwks(provider_config) - oidc_checks[provider_name] = { - "status": "healthy", - "issuer": provider_config.issuer, - } - except Exception as e: - oidc_checks[provider_name] = { - "status": "unhealthy", - "issuer": provider_config.issuer, - "error": str(e), - } - # OIDC failures are not critical for non-production - if config.app.environment == "production": - all_healthy = False - - if oidc_checks: - checks["oidc_providers"] = oidc_checks - - # Build response - response = { - "status": "ready" if all_healthy else "not_ready", - "environment": config.app.environment, - "checks": checks, - } - - # Return 503 if not all services are healthy - if not all_healthy: + if result.status == OverallStatus.NOT_READY: return JSONResponse( status_code=503, - content=response, + content=result.model_dump(mode="json"), ) - return response + return result + + +# ============================================================================= +# Individual Service Health Checks +# ============================================================================= + +@router.get( + "/database", + response_model=DatabaseHealthDetailed, + responses={ + 503: { + "description": "Database is unhealthy", + "model": HealthCheckError, + }, + }, + summary="Database health check", + description="Database-specific health check with connection pool status.", +) +async def health_database(request: Request) -> DatabaseHealthDetailed | JSONResponse: + """Database-specific health check with connection pool status. -@router.get("/database", response_model=None) -async def health_database(request: Request) -> dict[str, Any] | JSONResponse: - """Database-specific health check with connection pool status.""" + Returns detailed information about the database connection including + pool statistics when available. 
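+
+    Illustrative 200 response body (example values; the exact pool keys
+    depend on the database service implementation):
+
+        {"status": "healthy", "type": "postgresql",
+         "pool": {"size": 5, "in_use": 1}}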
+ """ app_deps: ApplicationDependencies = request.app.state.app_dependencies config = get_config() + db_type = "postgresql" if "postgresql" in config.database.url else "sqlite" + try: healthy = app_deps.database_service.health_check() pool_status = app_deps.database_service.get_pool_status() - return { - "status": "healthy" if healthy else "unhealthy", - "type": "postgresql" if "postgresql" in config.database.url else "sqlite", - "pool": pool_status, - } + # Convert pool_status to compatible type + pool: dict[str, int | str] | None = dict(pool_status) if pool_status else None + + return DatabaseHealthDetailed( + status=ServiceStatus.HEALTHY if healthy else ServiceStatus.UNHEALTHY, + type=db_type, + pool=pool, + ) except Exception as e: return JSONResponse( status_code=503, - content={ - "status": "unhealthy", - "error": str(e), - "error_type": type(e).__name__, - }, + content=HealthCheckError( + status=ServiceStatus.UNHEALTHY, + error=str(e), + error_type=type(e).__name__, + ).model_dump(mode="json"), ) -@router.get("/redis", response_model=None) -async def health_redis(request: Request) -> dict[str, Any] | JSONResponse: +@router.get( + "/redis", + response_model=RedisHealthDetailed, + responses={ + 503: { + "description": "Redis is unhealthy", + "model": HealthCheckError, + }, + }, + summary="Redis health check", + description="Redis-specific health check with server information.", +) +async def health_redis( + health_service: HealthCheckService = Depends(get_health_service), +) -> RedisHealthDetailed | JSONResponse: """Redis-specific health check using actual Redis operations. This performs a real test operation (set/get/delete) to verify Redis is functioning correctly, not just a simple PING. - """ - app_deps: ApplicationDependencies = request.app.state.app_dependencies - config = get_config() - if not config.redis.enabled: - return { - "status": "disabled", - "type": "in-memory", - "note": "Redis is not enabled, using in-memory storage", - } + Returns detailed Redis server information when available. + """ + result = await health_service.check_redis_detailed() - try: - # Use the health_check method from RedisService - if app_deps.redis_service: - healthy = await app_deps.redis_service.health_check() - # Optionally get more detailed info - info = await app_deps.redis_service.get_info() - else: - healthy = False # Redis service not configured - info = None - - result: dict[str, Any] = { - "status": "healthy" if healthy else "unhealthy", - "type": "redis", - "url": config.redis.url, - } - - if info: - result["info"] = info - - return result - except Exception as e: + if result.status == ServiceStatus.UNHEALTHY: return JSONResponse( status_code=503, - content={ - "status": "unhealthy", - "type": "redis", - "error": str(e), - "error_type": type(e).__name__, - "fallback": "in-memory storage", - }, + content=result.model_dump(mode="json"), ) + return result + + +@router.get( + "/temporal", + response_model=TemporalHealthDetailed, + responses={ + 503: { + "description": "Temporal is unhealthy", + "model": HealthCheckError, + }, + }, + summary="Temporal health check", + description="Temporal workflow service health check.", +) +async def health_temporal( + health_service: HealthCheckService = Depends(get_health_service), +) -> TemporalHealthDetailed | JSONResponse: + """Temporal-specific health check. + + Returns connection status and configuration details for the + Temporal workflow service. 
+ """ + result = await health_service.check_temporal_detailed() -@router.get("/temporal", response_model=None) -async def health_temporal(request: Request) -> dict[str, Any] | JSONResponse: - """Temporal-specific health check.""" - app_deps: ApplicationDependencies = request.app.state.app_dependencies - - if not app_deps.temporal_service.is_enabled: - return { - "status": "disabled", - "note": "Temporal service is not enabled", - } - - try: - healthy = await app_deps.temporal_service.health_check() - - return { - "status": "healthy" if healthy else "unhealthy", - "url": app_deps.temporal_service.url, - "namespace": app_deps.temporal_service.namespace, - "task_queue": app_deps.temporal_service.task_queue, - } - except Exception as e: + if result.status == ServiceStatus.UNHEALTHY: return JSONResponse( status_code=503, - content={ - "status": "unhealthy", - "error": str(e), - "error_type": type(e).__name__, - }, + content=result.model_dump(mode="json"), ) + + return result diff --git a/src/app/api/http/routers/workflows.py b/src/app/api/http/routers/workflows.py index 84bc635..78dd23c 100644 --- a/src/app/api/http/routers/workflows.py +++ b/src/app/api/http/routers/workflows.py @@ -1,64 +1,159 @@ -# api/routes/workflows.py +"""Temporal workflow management endpoints. + +This module provides HTTP endpoints for managing Temporal workflows, +including starting workflows, sending signals, and querying workflow state. + +Endpoint Summary: + POST /workflows/start - Start a new workflow + POST /workflows/{id}/signal/{name} - Send signal to a running workflow + GET /workflows/{id} - Query workflow state +""" + +from __future__ import annotations + import uuid -from typing import Any from fastapi import APIRouter, Depends, HTTPException -from pydantic import BaseModel from src.app.api.http.deps import get_temporal_service +from src.app.api.http.schemas.workflows import ( + WorkflowNotFoundError, + WorkflowQueryResponse, + WorkflowSignalRequest, + WorkflowSignalResponse, + WorkflowStartRequest, + WorkflowStartResponse, +) from src.app.core.services import TemporalClientService -router = APIRouter(prefix="/workflows") +router = APIRouter(prefix="/workflows", tags=["workflows"]) -class StartRequest(BaseModel): - workflow: str # e.g., "OrderWorkflow" - args: list[Any] = [] - kwargs: dict[str, Any] = {} - id: str | None = None - task_queue: str = "app" +@router.post( + "/start", + response_model=WorkflowStartResponse, + responses={ + 404: { + "description": "Workflow type not found", + "model": WorkflowNotFoundError, + }, + }, + summary="Start a new workflow", + description="Start a new Temporal workflow execution with the specified parameters.", +) +async def start_workflow( + request: WorkflowStartRequest, + client_service: TemporalClientService = Depends(get_temporal_service), +) -> WorkflowStartResponse: + """Start a new Temporal workflow. + Looks up the workflow class by name from the worker.workflows module + and starts a new execution with the provided arguments. 
+ + Args: + request: Workflow start parameters including workflow name and arguments + + Returns: + WorkflowStartResponse with the workflow ID and run ID + + Raises: + HTTPException: 404 if the workflow type is not found + """ + # Dynamic import of workflow class + try: + workflow_module = __import__("worker.workflows", fromlist=[request.workflow]) + workflow_class = getattr(workflow_module, request.workflow, None) + except ImportError: + workflow_class = None + + if not workflow_class: + raise HTTPException( + status_code=404, + detail=f"Workflow type '{request.workflow}' not found", + ) -@router.post("/start") -async def start( - req: StartRequest, - client_service: TemporalClientService = Depends(get_temporal_service), -) -> dict[str, str | None]: - wf = getattr( - __import__("worker.workflows", fromlist=[req.workflow]), req.workflow, None - ) client = await client_service.get_client() - if not wf: - raise HTTPException(404, "Workflow type not found") + + # Generate workflow ID if not provided + workflow_id = request.id or f"{request.workflow.lower()}-{uuid.uuid4()}" handle = await client.start_workflow( - wf.run, - *req.args, - **req.kwargs, - id=req.id or f"{req.workflow.lower()}-{uuid.uuid4()}", - task_queue=req.task_queue, + workflow_class.run, + *request.args, + **request.kwargs, + id=workflow_id, + task_queue=request.task_queue, + ) + + return WorkflowStartResponse( + workflow_id=handle.id, + run_id=handle.first_execution_run_id, ) - return {"workflow_id": handle.id, "run_id": handle.first_execution_run_id} -@router.post("/{workflow_id}/signal/{signal_name}") -async def signal( +@router.post( + "/{workflow_id}/signal/{signal_name}", + response_model=WorkflowSignalResponse, + summary="Signal a running workflow", + description="Send a signal to a running workflow to trigger state changes or actions.", +) +async def signal_workflow( workflow_id: str, signal_name: str, - payload: dict[str, Any], + request: WorkflowSignalRequest, client_service: TemporalClientService = Depends(get_temporal_service), -) -> dict[str, bool]: +) -> WorkflowSignalResponse: + """Send a signal to a running workflow. + + Signals are async messages that workflows can handle to trigger + state changes, continue waiting operations, or perform actions. + + Args: + workflow_id: ID of the running workflow + signal_name: Name of the signal to send + request: Signal payload + + Returns: + WorkflowSignalResponse indicating success + """ client = await client_service.get_client() - h = client.get_workflow_handle(workflow_id) - await h.signal(signal_name, **payload) - return {"ok": True} + handle = client.get_workflow_handle(workflow_id) + + await handle.signal(signal_name, **request.payload) + + return WorkflowSignalResponse( + success=True, + message=f"Signal '{signal_name}' sent to workflow '{workflow_id}'", + ) -@router.get("/{workflow_id}") -async def read( +@router.get( + "/{workflow_id}", + response_model=WorkflowQueryResponse, + summary="Query workflow state", + description="Query the current state of a workflow using the 'state' query handler.", +) +async def query_workflow( workflow_id: str, client_service: TemporalClientService = Depends(get_temporal_service), -) -> Any: +) -> WorkflowQueryResponse: + """Query a workflow's current state. + + Uses the 'state' query handler defined in the workflow to retrieve + the current workflow state. 
+ + Args: + workflow_id: ID of the workflow to query + + Returns: + WorkflowQueryResponse with the workflow state + """ client = await client_service.get_client() - h = client.get_workflow_handle(workflow_id) - return await h.query("state") + handle = client.get_workflow_handle(workflow_id) + + state = await handle.query("state") + + return WorkflowQueryResponse( + state=state, + workflow_id=workflow_id, + ) diff --git a/src/app/api/http/schemas/__init__.py b/src/app/api/http/schemas/__init__.py index e69de29..465162a 100644 --- a/src/app/api/http/schemas/__init__.py +++ b/src/app/api/http/schemas/__init__.py @@ -0,0 +1,61 @@ +"""API schema definitions for HTTP endpoints. + +This package contains Pydantic models for request/response schemas +used by the HTTP API layer. + +Modules: + health: Health check response models + workflows: Temporal workflow request/response models +""" + +from src.app.api.http.schemas.health import ( + AllServicesHealth, + DatabaseHealth, + DatabaseHealthDetailed, + HealthCheckError, + LivenessResponse, + OIDCProviderHealth, + OverallStatus, + ReadinessResponse, + RedisHealth, + RedisHealthDetailed, + ServiceHealthBase, + ServiceStatus, + TemporalHealth, + TemporalHealthDetailed, +) +from src.app.api.http.schemas.workflows import ( + WorkflowExecutionError, + WorkflowNotFoundError, + WorkflowQueryResponse, + WorkflowSignalRequest, + WorkflowSignalResponse, + WorkflowStartRequest, + WorkflowStartResponse, +) + +__all__ = [ + # Health schemas + "ServiceStatus", + "OverallStatus", + "ServiceHealthBase", + "DatabaseHealth", + "DatabaseHealthDetailed", + "RedisHealth", + "RedisHealthDetailed", + "TemporalHealth", + "TemporalHealthDetailed", + "OIDCProviderHealth", + "AllServicesHealth", + "ReadinessResponse", + "LivenessResponse", + "HealthCheckError", + # Workflow schemas + "WorkflowStartRequest", + "WorkflowStartResponse", + "WorkflowSignalRequest", + "WorkflowSignalResponse", + "WorkflowQueryResponse", + "WorkflowNotFoundError", + "WorkflowExecutionError", +] diff --git a/src/app/api/http/schemas/health.py b/src/app/api/http/schemas/health.py new file mode 100644 index 0000000..6c793ff --- /dev/null +++ b/src/app/api/http/schemas/health.py @@ -0,0 +1,237 @@ +"""Health check response schemas. + +This module defines the Pydantic models for health check API responses, +providing a consistent and well-documented interface for health endpoints. + +Status Terminology: + - healthy: Service is fully operational + - unhealthy: Service is not operational (critical failure) + - degraded: Service is partially operational or using fallback + - disabled: Service is intentionally not enabled in configuration +""" + +from __future__ import annotations + +from enum import Enum +from typing import Annotated + +from pydantic import BaseModel, ConfigDict, Field + + +class ServiceStatus(str, Enum): + """Standard status values for service health checks. + + These statuses provide a consistent vocabulary across all health + check responses: + + - HEALTHY: Service is fully operational and responding normally + - UNHEALTHY: Service is not operational (connection failed, errors, etc.) + - DEGRADED: Service is partially operational or using a fallback mechanism + - DISABLED: Service is intentionally not enabled in the configuration + """ + + HEALTHY = "healthy" + UNHEALTHY = "unhealthy" + DEGRADED = "degraded" + DISABLED = "disabled" + + +class OverallStatus(str, Enum): + """Overall application readiness status. 
+ + - READY: All critical services are operational + - NOT_READY: One or more critical services are not operational + """ + + READY = "ready" + NOT_READY = "not_ready" + + +# ============================================================================= +# Base Models +# ============================================================================= + + +class ServiceHealthBase(BaseModel): + """Base model for individual service health check results.""" + + model_config = ConfigDict(use_enum_values=True) + + status: ServiceStatus = Field(description="Current health status of the service") + note: str | None = Field( + default=None, + description="Optional additional context about the status", + ) + error: str | None = Field( + default=None, + description="Error message if status is unhealthy", + ) + + +# ============================================================================= +# Database Health Models +# ============================================================================= + + +class DatabaseHealth(ServiceHealthBase): + """Health check result for the database service.""" + + type: Annotated[ + str, + Field(description="Database type (postgresql, sqlite)"), + ] + + +class DatabaseHealthDetailed(DatabaseHealth): + """Detailed health check result for the database with pool info.""" + + pool: dict[str, int | str] | None = Field( + default=None, + description="Connection pool statistics", + ) + + +# ============================================================================= +# Redis Health Models +# ============================================================================= + + +class RedisHealth(ServiceHealthBase): + """Health check result for the Redis service.""" + + type: Annotated[ + str, + Field(description="Storage type (redis, in-memory)"), + ] + + +class RedisHealthDetailed(RedisHealth): + """Detailed health check result for Redis with server info.""" + + url: str | None = Field( + default=None, + description="Redis connection URL (masked)", + ) + info: dict[str, str | int] | None = Field( + default=None, + description="Redis server information", + ) + fallback: str | None = Field( + default=None, + description="Fallback mechanism being used if Redis is unavailable", + ) + + +# ============================================================================= +# Temporal Health Models +# ============================================================================= + + +class TemporalHealth(ServiceHealthBase): + """Health check result for the Temporal workflow service.""" + + url: str | None = Field( + default=None, + description="Temporal server URL", + ) + namespace: str | None = Field( + default=None, + description="Temporal namespace", + ) + + +class TemporalHealthDetailed(TemporalHealth): + """Detailed health check result for Temporal.""" + + task_queue: str | None = Field( + default=None, + description="Temporal task queue name", + ) + + +# ============================================================================= +# OIDC Provider Health Models +# ============================================================================= + + +class OIDCProviderHealth(ServiceHealthBase): + """Health check result for an individual OIDC provider.""" + + issuer: str = Field(description="OIDC issuer URL") + + +class OIDCProvidersHealth(BaseModel): + """Health check results for all configured OIDC providers.""" + + model_config = ConfigDict(extra="allow") + + # Dynamic fields for each provider (e.g., google, microsoft, keycloak) + # Using extra="allow" to handle dynamic provider names + + 
+# ============================================================================= +# Aggregate Health Check Models +# ============================================================================= + + +class AllServicesHealth(BaseModel): + """Aggregated health check results for all services.""" + + model_config = ConfigDict(use_enum_values=True) + + database: DatabaseHealth + redis: RedisHealth + temporal: TemporalHealth + oidc_providers: dict[str, OIDCProviderHealth] | None = Field( + default=None, + description="Health status of configured OIDC providers", + ) + + +class ReadinessResponse(BaseModel): + """Response model for the /health/ready endpoint. + + This is the primary health check endpoint used by Kubernetes + readiness probes to determine if the application is ready + to receive traffic. + """ + + model_config = ConfigDict(use_enum_values=True) + + status: OverallStatus = Field(description="Overall application readiness status") + environment: str = Field( + description="Current deployment environment (development, production, etc.)" + ) + checks: AllServicesHealth = Field( + description="Individual service health check results" + ) + + +class LivenessResponse(BaseModel): + """Response model for the /health endpoint (liveness probe). + + This is a simple health check that only verifies the application + process is running. It does not check dependencies. + """ + + status: Annotated[ + str, + Field(description="Always 'healthy' if the app is running"), + ] = "healthy" + service: Annotated[ + str, + Field(description="Service identifier"), + ] = "api" + + +# ============================================================================= +# Error Response Models +# ============================================================================= + + +class HealthCheckError(BaseModel): + """Error response for health check endpoints.""" + + status: ServiceStatus = ServiceStatus.UNHEALTHY + error: str = Field(description="Error message") + error_type: str = Field(description="Exception class name") diff --git a/src/app/api/http/schemas/workflows.py b/src/app/api/http/schemas/workflows.py new file mode 100644 index 0000000..f77adb4 --- /dev/null +++ b/src/app/api/http/schemas/workflows.py @@ -0,0 +1,128 @@ +"""Pydantic schemas for workflow API endpoints. + +This module defines request and response models for the Temporal workflow +management API endpoints. +""" + +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, Field + +# ============================================================================= +# Request Models +# ============================================================================= + + +class WorkflowStartRequest(BaseModel): + """Request model for starting a new workflow. + + Example: + ```json + { + "workflow": "OrderWorkflow", + "args": [{"order_id": "123"}], + "task_queue": "orders" + } + ``` + """ + + workflow: str = Field( + description="Name of the workflow class to execute (e.g., 'OrderWorkflow')" + ) + args: list[Any] = Field( + default_factory=list, + description="Positional arguments to pass to the workflow", + ) + kwargs: dict[str, Any] = Field( + default_factory=dict, + description="Keyword arguments to pass to the workflow", + ) + id: str | None = Field( + default=None, + description="Custom workflow ID. 
If not provided, one is auto-generated.", + ) + task_queue: str = Field( + default="app", + description="Temporal task queue to use for the workflow", + ) + + +class WorkflowSignalRequest(BaseModel): + """Request model for signaling a workflow. + + Signals are async messages that can be sent to running workflows + to trigger state changes or actions. + """ + + payload: dict[str, Any] = Field( + default_factory=dict, + description="Data payload to send with the signal", + ) + + +# ============================================================================= +# Response Models +# ============================================================================= + + +class WorkflowStartResponse(BaseModel): + """Response model for a successfully started workflow.""" + + workflow_id: str = Field(description="Unique identifier for the workflow") + run_id: str | None = Field( + default=None, + description="Temporal run ID for this workflow execution", + ) + + +class WorkflowSignalResponse(BaseModel): + """Response model for a successful signal operation.""" + + success: bool = Field( + default=True, + description="Whether the signal was sent successfully", + ) + message: str = Field( + default="Signal sent", + description="Status message", + ) + + +class WorkflowQueryResponse(BaseModel): + """Response model for a workflow query. + + The actual structure depends on the workflow's query handler, + but this provides a typed wrapper. + """ + + state: Any = Field(description="Current workflow state from the query") + workflow_id: str | None = Field( + default=None, + description="Workflow ID that was queried", + ) + + +class WorkflowNotFoundError(BaseModel): + """Error response when a workflow type is not found.""" + + detail: str = Field(description="Error message describing what went wrong") + available_workflows: list[str] | None = Field( + default=None, + description="List of available workflow types", + ) + + +class WorkflowExecutionError(BaseModel): + """Error response for workflow execution failures.""" + + detail: str = Field(description="Error message") + workflow_id: str | None = Field( + default=None, + description="Workflow ID if available", + ) + error_type: str | None = Field( + default=None, + description="Type of error that occurred", + ) diff --git a/src/app/core/services/health_service.py b/src/app/core/services/health_service.py new file mode 100644 index 0000000..3630562 --- /dev/null +++ b/src/app/core/services/health_service.py @@ -0,0 +1,350 @@ +"""Health check service for application dependencies. + +This module provides a centralized service for performing health checks +on all application dependencies (database, Redis, Temporal, OIDC providers). + +The service abstracts health check logic from the HTTP layer, making it +testable and reusable across different contexts (HTTP endpoints, CLI tools, +background workers, etc.). +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +from src.app.api.http.schemas.health import ( + AllServicesHealth, + DatabaseHealth, + OIDCProviderHealth, + OverallStatus, + ReadinessResponse, + RedisHealth, + RedisHealthDetailed, + ServiceStatus, + TemporalHealth, + TemporalHealthDetailed, +) + +if TYPE_CHECKING: + from src.app.api.http.app_data import ApplicationDependencies + from src.app.runtime.config.config_data import ConfigData + + +@dataclass +class HealthCheckResult: + """Result of an individual health check. 
+ + Attributes: + status: The health status of the service + details: Additional details about the check (type, url, etc.) + is_critical: Whether this service failure should mark the app as not ready + """ + + status: ServiceStatus + details: dict[str, str | int | bool | None] + is_critical: bool = True + + +class HealthCheckService: + """Service for performing health checks on application dependencies. + + This service encapsulates all health check logic and provides a clean + interface for checking individual services or all services at once. + + Example: + ```python + health_service = HealthCheckService(app_deps, config) + result = await health_service.check_all() + if result.status == OverallStatus.READY: + print("All systems go!") + ``` + """ + + def __init__( + self, + app_deps: ApplicationDependencies, + config: ConfigData, + ) -> None: + """Initialize the health check service. + + Args: + app_deps: Application dependencies container + config: Application configuration + """ + self._app_deps = app_deps + self._config = config + + # ========================================================================= + # Public API + # ========================================================================= + + async def check_all(self) -> ReadinessResponse: + """Perform health checks on all services. + + Returns: + ReadinessResponse with status of all services + """ + database = await self.check_database() + redis = await self.check_redis() + temporal = await self.check_temporal() + oidc_providers = await self.check_oidc_providers() + + # Determine overall status based on critical services + all_healthy = self._evaluate_overall_health( + database=database, + redis=redis, + temporal=temporal, + oidc_providers=oidc_providers, + ) + + return ReadinessResponse( + status=OverallStatus.READY if all_healthy else OverallStatus.NOT_READY, + environment=self._config.app.environment, + checks=AllServicesHealth( + database=database, + redis=redis, + temporal=temporal, + oidc_providers=oidc_providers if oidc_providers else None, + ), + ) + + async def check_database(self) -> DatabaseHealth: + """Check database connectivity. + + Returns: + DatabaseHealth with status and database type + """ + db_type = self._get_database_type() + + try: + healthy = self._app_deps.database_service.health_check() + return DatabaseHealth( + status=ServiceStatus.HEALTHY if healthy else ServiceStatus.UNHEALTHY, + type=db_type, + ) + except Exception as e: + return DatabaseHealth( + status=ServiceStatus.UNHEALTHY, + type=db_type, + error=str(e), + ) + + async def check_redis(self) -> RedisHealth: + """Check Redis connectivity. + + Redis is a non-critical service - if unavailable, the application + falls back to in-memory storage. 
+ + Returns: + RedisHealth with status and storage type + """ + if not self._config.redis.enabled: + return RedisHealth( + status=ServiceStatus.DISABLED, + type="in-memory", + note="Redis is not enabled, using in-memory storage", + ) + + try: + if self._app_deps.redis_service: + healthy = await self._app_deps.redis_service.health_check() + return RedisHealth( + status=ServiceStatus.HEALTHY if healthy else ServiceStatus.DEGRADED, + type="redis" if healthy else "in-memory", + note=None if healthy else "Falling back to in-memory storage", + ) + else: + return RedisHealth( + status=ServiceStatus.DEGRADED, + type="in-memory", + note="Redis service not initialized, using in-memory storage", + ) + except Exception as e: + return RedisHealth( + status=ServiceStatus.DEGRADED, + type="in-memory", + note="Using in-memory storage due to Redis error", + error=str(e), + ) + + async def check_redis_detailed(self) -> RedisHealthDetailed: + """Check Redis with detailed server information. + + Returns: + RedisHealthDetailed with server info if available + """ + if not self._config.redis.enabled: + return RedisHealthDetailed( + status=ServiceStatus.DISABLED, + type="in-memory", + note="Redis is not enabled, using in-memory storage", + ) + + try: + if self._app_deps.redis_service: + healthy = await self._app_deps.redis_service.health_check() + info = ( + await self._app_deps.redis_service.get_info() if healthy else None + ) + + return RedisHealthDetailed( + status=ServiceStatus.HEALTHY + if healthy + else ServiceStatus.UNHEALTHY, + type="redis", + url=self._config.redis.url, + info=info, + ) + else: + return RedisHealthDetailed( + status=ServiceStatus.DEGRADED, + type="in-memory", + note="Redis service not initialized", + fallback="in-memory storage", + ) + except Exception as e: + return RedisHealthDetailed( + status=ServiceStatus.UNHEALTHY, + type="redis", + error=str(e), + fallback="in-memory storage", + ) + + async def check_temporal(self) -> TemporalHealth: + """Check Temporal workflow service connectivity. + + Returns: + TemporalHealth with status and connection details + """ + if not self._config.temporal.enabled: + return TemporalHealth( + status=ServiceStatus.DISABLED, + note="Temporal service is not enabled", + ) + + try: + healthy = await self._app_deps.temporal_service.health_check() + return TemporalHealth( + status=ServiceStatus.HEALTHY if healthy else ServiceStatus.UNHEALTHY, + url=self._app_deps.temporal_service.url, + namespace=self._app_deps.temporal_service.namespace, + ) + except Exception as e: + return TemporalHealth( + status=ServiceStatus.UNHEALTHY, + error=str(e), + ) + + async def check_temporal_detailed(self) -> TemporalHealthDetailed: + """Check Temporal with detailed configuration info. + + Returns: + TemporalHealthDetailed with task queue info + """ + if not self._config.temporal.enabled: + return TemporalHealthDetailed( + status=ServiceStatus.DISABLED, + note="Temporal service is not enabled", + ) + + try: + healthy = await self._app_deps.temporal_service.health_check() + return TemporalHealthDetailed( + status=ServiceStatus.HEALTHY if healthy else ServiceStatus.UNHEALTHY, + url=self._app_deps.temporal_service.url, + namespace=self._app_deps.temporal_service.namespace, + task_queue=self._app_deps.temporal_service.task_queue, + ) + except Exception as e: + return TemporalHealthDetailed( + status=ServiceStatus.UNHEALTHY, + error=str(e), + ) + + async def check_oidc_providers(self) -> dict[str, OIDCProviderHealth] | None: + """Check all configured OIDC providers. 
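+
+        Illustrative return value ('google' is a hypothetical provider name):
+
+            {"google": OIDCProviderHealth(status=ServiceStatus.HEALTHY,
+                                          issuer="https://accounts.google.com")}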
+ + Returns: + Dict of provider name to health status, or None if no providers configured + """ + if not self._config.oidc.providers: + return None + + results: dict[str, OIDCProviderHealth] = {} + jwks_service = self._app_deps.jwks_service + + for provider_name, provider_config in self._config.oidc.providers.items(): + try: + # Verify provider is reachable by fetching JWKS + await jwks_service.fetch_jwks(provider_config) + results[provider_name] = OIDCProviderHealth( + status=ServiceStatus.HEALTHY, + issuer=provider_config.issuer, + ) + except Exception as e: + results[provider_name] = OIDCProviderHealth( + status=ServiceStatus.UNHEALTHY, + issuer=provider_config.issuer, + error=str(e), + ) + + return results if results else None + + # ========================================================================= + # Private Helpers + # ========================================================================= + + def _get_database_type(self) -> str: + """Determine database type from connection URL.""" + if "postgresql" in self._config.database.url: + return "postgresql" + elif "sqlite" in self._config.database.url: + return "sqlite" + else: + return "unknown" + + def _evaluate_overall_health( + self, + database: DatabaseHealth, + redis: RedisHealth, + temporal: TemporalHealth, + oidc_providers: dict[str, OIDCProviderHealth] | None, + ) -> bool: + """Evaluate overall health based on individual service checks. + + Critical services: + - Database: Always critical + - Temporal: Critical only if enabled + - OIDC: Critical only in production + + Non-critical services: + - Redis: Falls back to in-memory, never critical + + Args: + database: Database health check result + redis: Redis health check result + temporal: Temporal health check result + oidc_providers: OIDC provider health check results + + Returns: + True if all critical services are healthy + """ + # Database is always critical + if database.status == ServiceStatus.UNHEALTHY: + return False + + # Temporal is critical only if enabled + if temporal.status == ServiceStatus.UNHEALTHY: + return False + + # OIDC is critical only in production + if oidc_providers and self._config.app.environment == "production": + for provider in oidc_providers.values(): + if provider.status == ServiceStatus.UNHEALTHY: + return False + + # Redis is never critical (falls back to in-memory) + # So we don't check redis.status here + + return True diff --git a/tests/unit/app/api/http/routers/test_health_endpoints.py b/tests/unit/app/api/http/routers/test_health_endpoints.py new file mode 100644 index 0000000..086eeef --- /dev/null +++ b/tests/unit/app/api/http/routers/test_health_endpoints.py @@ -0,0 +1,316 @@ +"""Unit tests for health check API endpoints. + +These tests verify the health router behavior with mocked dependencies. 
+""" + +from __future__ import annotations + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from src.app.api.http.routers.health import get_health_service, router +from src.app.api.http.schemas.health import ( + AllServicesHealth, + DatabaseHealth, + LivenessResponse, + OverallStatus, + ReadinessResponse, + RedisHealth, + RedisHealthDetailed, + ServiceStatus, + TemporalHealth, + TemporalHealthDetailed, +) +from src.app.core.services.health_service import HealthCheckService + + +@pytest.fixture +def mock_config() -> MagicMock: + """Create a mock configuration.""" + config = MagicMock() + config.app.environment = "development" + config.database.url = "sqlite:///test.db" + config.redis.enabled = True + config.redis.url = "redis://localhost:6379" + config.temporal.enabled = False + config.oidc.providers = {} + return config + + +@pytest.fixture +def mock_app_deps() -> MagicMock: + """Create mock application dependencies.""" + deps = MagicMock() + deps.database_service.health_check.return_value = True + deps.database_service.get_pool_status.return_value = {"size": 5, "in_use": 1} + deps.redis_service = AsyncMock() + deps.redis_service.health_check = AsyncMock(return_value=True) + deps.redis_service.get_info = AsyncMock( + return_value={"version": "7.0.0", "connected_clients": 5} + ) + deps.temporal_service.is_enabled = False + deps.temporal_service.health_check = AsyncMock(return_value=True) + deps.temporal_service.url = "localhost:7233" + deps.temporal_service.namespace = "default" + deps.temporal_service.task_queue = "app" + deps.jwks_service = AsyncMock() + return deps + + +@pytest.fixture +def mock_health_service(mock_app_deps: MagicMock, mock_config: MagicMock) -> MagicMock: + """Create a mock health check service.""" + service = MagicMock(spec=HealthCheckService) + + # Configure check_all to return a successful response + service.check_all = AsyncMock( + return_value=ReadinessResponse( + status=OverallStatus.READY, + environment="development", + checks=AllServicesHealth( + database=DatabaseHealth(status=ServiceStatus.HEALTHY, type="sqlite"), + redis=RedisHealth(status=ServiceStatus.DISABLED, type="in-memory"), + temporal=TemporalHealth(status=ServiceStatus.DISABLED), + ), + ) + ) + + service.check_redis_detailed = AsyncMock( + return_value=RedisHealthDetailed( + status=ServiceStatus.HEALTHY, + type="redis", + url="redis://localhost:6379", + info={"version": "7.0.0"}, + ) + ) + + service.check_temporal_detailed = AsyncMock( + return_value=TemporalHealthDetailed( + status=ServiceStatus.DISABLED, + note="Temporal service is not enabled", + ) + ) + + return service + + +@pytest.fixture +def test_app(mock_health_service: MagicMock) -> FastAPI: + """Create test FastAPI application.""" + app = FastAPI() + app.include_router(router) + + # Override the health service dependency + app.dependency_overrides[get_health_service] = lambda: mock_health_service + + return app + + +@pytest.fixture +def client(test_app: FastAPI) -> TestClient: + """Create test client.""" + return TestClient(test_app) + + +class TestLivenessEndpoint: + """Tests for the /health liveness probe endpoint.""" + + def test_health_returns_healthy(self, client: TestClient) -> None: + """Test basic health check returns healthy status.""" + response = client.get("/health") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "healthy" + assert data["service"] == "api" + + def 
test_health_response_model(self, client: TestClient) -> None: + """Test health response matches LivenessResponse model.""" + response = client.get("/health") + data = response.json() + + # Validate against model + liveness = LivenessResponse(**data) + assert liveness.status == "healthy" + assert liveness.service == "api" + + +class TestReadinessEndpoint: + """Tests for the /health/ready readiness probe endpoint.""" + + def test_readiness_returns_ready( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test readiness check returns ready when all services healthy.""" + response = client.get("/health/ready") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "ready" + assert data["environment"] == "development" + assert "checks" in data + + def test_readiness_returns_503_when_not_ready( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test readiness check returns 503 when critical services fail.""" + # Configure service to return NOT_READY + mock_health_service.check_all = AsyncMock( + return_value=ReadinessResponse( + status=OverallStatus.NOT_READY, + environment="development", + checks=AllServicesHealth( + database=DatabaseHealth( + status=ServiceStatus.UNHEALTHY, + type="postgresql", + error="Connection refused", + ), + redis=RedisHealth(status=ServiceStatus.HEALTHY, type="redis"), + temporal=TemporalHealth(status=ServiceStatus.DISABLED), + ), + ) + ) + + response = client.get("/health/ready") + + assert response.status_code == 503 + data = response.json() + assert data["status"] == "not_ready" + assert data["checks"]["database"]["status"] == "unhealthy" + + def test_readiness_includes_all_service_checks( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test readiness response includes all expected service checks.""" + response = client.get("/health/ready") + data = response.json() + + checks = data["checks"] + assert "database" in checks + assert "redis" in checks + assert "temporal" in checks + + +class TestRedisHealthEndpoint: + """Tests for the /health/redis endpoint.""" + + def test_redis_healthy( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test Redis health check returns healthy status.""" + response = client.get("/health/redis") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "healthy" + assert data["type"] == "redis" + + def test_redis_unhealthy_returns_503( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test Redis health check returns 503 when unhealthy.""" + mock_health_service.check_redis_detailed = AsyncMock( + return_value=RedisHealthDetailed( + status=ServiceStatus.UNHEALTHY, + type="redis", + error="Connection refused", + ) + ) + + response = client.get("/health/redis") + + assert response.status_code == 503 + data = response.json() + assert data["status"] == "unhealthy" + + def test_redis_disabled( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test Redis health check shows disabled status.""" + mock_health_service.check_redis_detailed = AsyncMock( + return_value=RedisHealthDetailed( + status=ServiceStatus.DISABLED, + type="in-memory", + note="Redis is not enabled, using in-memory storage", + ) + ) + + response = client.get("/health/redis") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "disabled" + assert data["type"] == "in-memory" + + +class TestTemporalHealthEndpoint: + 
"""Tests for the /health/temporal endpoint.""" + + def test_temporal_disabled( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test Temporal health check shows disabled status.""" + response = client.get("/health/temporal") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "disabled" + + def test_temporal_healthy( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test Temporal health check returns healthy when enabled.""" + mock_health_service.check_temporal_detailed = AsyncMock( + return_value=TemporalHealthDetailed( + status=ServiceStatus.HEALTHY, + url="localhost:7233", + namespace="default", + task_queue="app", + ) + ) + + response = client.get("/health/temporal") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "healthy" + assert data["url"] == "localhost:7233" + + def test_temporal_unhealthy_returns_503( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test Temporal health check returns 503 when unhealthy.""" + mock_health_service.check_temporal_detailed = AsyncMock( + return_value=TemporalHealthDetailed( + status=ServiceStatus.UNHEALTHY, + error="Connection timeout", + ) + ) + + response = client.get("/health/temporal") + + assert response.status_code == 503 + data = response.json() + assert data["status"] == "unhealthy" + + +class TestServiceStatusEnum: + """Tests for the ServiceStatus enum values.""" + + def test_enum_values(self) -> None: + """Test ServiceStatus enum has expected values.""" + assert ServiceStatus.HEALTHY.value == "healthy" + assert ServiceStatus.UNHEALTHY.value == "unhealthy" + assert ServiceStatus.DEGRADED.value == "degraded" + assert ServiceStatus.DISABLED.value == "disabled" + + def test_enum_serialization(self) -> None: + """Test ServiceStatus serializes correctly in Pydantic models.""" + health = DatabaseHealth(status=ServiceStatus.HEALTHY, type="sqlite") + data = health.model_dump() + + # With use_enum_values=True, should serialize as string + assert data["status"] == "healthy" diff --git a/tests/unit/app/core/services/test_health_service.py b/tests/unit/app/core/services/test_health_service.py new file mode 100644 index 0000000..c3b6a47 --- /dev/null +++ b/tests/unit/app/core/services/test_health_service.py @@ -0,0 +1,378 @@ +"""Unit tests for the HealthCheckService. + +These tests verify the health check service logic with mocked dependencies. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from src.app.api.http.schemas.health import ( + OverallStatus, + ServiceStatus, +) +from src.app.core.services.health_service import HealthCheckService + +if TYPE_CHECKING: + pass + + +@pytest.fixture +def mock_config() -> MagicMock: + """Create a mock configuration.""" + config = MagicMock() + config.app.environment = "development" + config.database.url = "postgresql://localhost:5432/test" + config.redis.enabled = True + config.redis.url = "redis://localhost:6379" + config.temporal.enabled = True + config.oidc.providers = {} + return config + + +@pytest.fixture +def mock_app_deps() -> MagicMock: + """Create mock application dependencies.""" + deps = MagicMock() + + # Database service mocks + deps.database_service.health_check.return_value = True + deps.database_service.get_pool_status.return_value = {"size": 5, "in_use": 1} + + # Redis service mocks + deps.redis_service = AsyncMock() + deps.redis_service.health_check = AsyncMock(return_value=True) + deps.redis_service.get_info = AsyncMock( + return_value={"version": "7.0.0", "connected_clients": 5} + ) + + # Temporal service mocks + deps.temporal_service = MagicMock() + deps.temporal_service.health_check = AsyncMock(return_value=True) + deps.temporal_service.url = "localhost:7233" + deps.temporal_service.namespace = "default" + deps.temporal_service.task_queue = "app" + + # JWKS service for OIDC checks + deps.jwks_service = AsyncMock() + + return deps + + +@pytest.fixture +def health_service( + mock_app_deps: MagicMock, mock_config: MagicMock +) -> HealthCheckService: + """Create a HealthCheckService instance with mocked dependencies.""" + return HealthCheckService(mock_app_deps, mock_config) + + +class TestCheckAll: + """Tests for the check_all method.""" + + async def test_all_services_healthy( + self, health_service: HealthCheckService + ) -> None: + """Test check_all returns READY when all services are healthy.""" + result = await health_service.check_all() + + assert result.status == OverallStatus.READY + assert result.checks.database.status == ServiceStatus.HEALTHY + assert result.checks.redis.status == ServiceStatus.HEALTHY + assert result.checks.temporal.status == ServiceStatus.HEALTHY + + async def test_database_failure_makes_not_ready( + self, + mock_app_deps: MagicMock, + mock_config: MagicMock, + ) -> None: + """Test check_all returns NOT_READY when database fails.""" + mock_app_deps.database_service.health_check.return_value = False + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_all() + + assert result.status == OverallStatus.NOT_READY + assert result.checks.database.status == ServiceStatus.UNHEALTHY + + async def test_redis_failure_still_ready( + self, + mock_app_deps: MagicMock, + mock_config: MagicMock, + ) -> None: + """Test check_all returns READY even when Redis fails (non-critical).""" + mock_app_deps.redis_service.health_check = AsyncMock(return_value=False) + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_all() + + # Redis is non-critical, so should still be READY + assert result.status == OverallStatus.READY + assert result.checks.redis.status == ServiceStatus.DEGRADED + + async def test_temporal_failure_makes_not_ready( + self, + mock_app_deps: MagicMock, + mock_config: MagicMock, + ) -> None: + """Test check_all returns NOT_READY when Temporal fails (if enabled).""" + 
mock_app_deps.temporal_service.health_check = AsyncMock(return_value=False) + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_all() + + assert result.status == OverallStatus.NOT_READY + assert result.checks.temporal.status == ServiceStatus.UNHEALTHY + + +class TestCheckDatabase: + """Tests for the check_database method.""" + + async def test_healthy_postgresql(self, health_service: HealthCheckService) -> None: + """Test database check returns healthy for PostgreSQL.""" + result = await health_service.check_database() + + assert result.status == ServiceStatus.HEALTHY + assert result.type == "postgresql" + assert result.error is None + + async def test_healthy_sqlite( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test database check returns healthy for SQLite.""" + mock_config.database.url = "sqlite:///test.db" + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_database() + + assert result.status == ServiceStatus.HEALTHY + assert result.type == "sqlite" + + async def test_database_exception( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test database check returns unhealthy on exception.""" + mock_app_deps.database_service.health_check.side_effect = Exception( + "Connection refused" + ) + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_database() + + assert result.status == ServiceStatus.UNHEALTHY + assert result.error == "Connection refused" + + +class TestCheckRedis: + """Tests for the check_redis method.""" + + async def test_redis_healthy(self, health_service: HealthCheckService) -> None: + """Test Redis check returns healthy when working.""" + result = await health_service.check_redis() + + assert result.status == ServiceStatus.HEALTHY + assert result.type == "redis" + + async def test_redis_disabled( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test Redis check returns disabled when not enabled.""" + mock_config.redis.enabled = False + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_redis() + + assert result.status == ServiceStatus.DISABLED + assert result.type == "in-memory" + assert "not enabled" in (result.note or "") + + async def test_redis_service_not_initialized( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test Redis check returns degraded when service not initialized.""" + mock_app_deps.redis_service = None + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_redis() + + assert result.status == ServiceStatus.DEGRADED + assert result.type == "in-memory" + + async def test_redis_exception( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test Redis check returns degraded on exception.""" + mock_app_deps.redis_service.health_check = AsyncMock( + side_effect=Exception("Connection timeout") + ) + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_redis() + + assert result.status == ServiceStatus.DEGRADED + assert "Connection timeout" in (result.error or "") + + +class TestCheckTemporal: + """Tests for the check_temporal method.""" + + async def test_temporal_healthy(self, health_service: HealthCheckService) -> None: + """Test Temporal check returns healthy when working.""" + result = await health_service.check_temporal() + + assert result.status == ServiceStatus.HEALTHY + assert result.url == 
"localhost:7233" + assert result.namespace == "default" + + async def test_temporal_disabled( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test Temporal check returns disabled when not enabled.""" + mock_config.temporal.enabled = False + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_temporal() + + assert result.status == ServiceStatus.DISABLED + assert "not enabled" in (result.note or "") + + async def test_temporal_exception( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test Temporal check returns unhealthy on exception.""" + mock_app_deps.temporal_service.health_check = AsyncMock( + side_effect=Exception("Service unavailable") + ) + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_temporal() + + assert result.status == ServiceStatus.UNHEALTHY + assert "Service unavailable" in (result.error or "") + + +class TestCheckTemporalDetailed: + """Tests for the check_temporal_detailed method.""" + + async def test_includes_task_queue( + self, health_service: HealthCheckService + ) -> None: + """Test detailed Temporal check includes task queue.""" + result = await health_service.check_temporal_detailed() + + assert result.status == ServiceStatus.HEALTHY + assert result.task_queue == "app" + + +class TestCheckRedisDetailed: + """Tests for the check_redis_detailed method.""" + + async def test_includes_server_info( + self, health_service: HealthCheckService + ) -> None: + """Test detailed Redis check includes server info.""" + result = await health_service.check_redis_detailed() + + assert result.status == ServiceStatus.HEALTHY + assert result.info is not None + assert result.info.get("version") == "7.0.0" + + +class TestCheckOIDCProviders: + """Tests for the check_oidc_providers method.""" + + async def test_no_providers_configured( + self, health_service: HealthCheckService + ) -> None: + """Test OIDC check returns None when no providers configured.""" + result = await health_service.check_oidc_providers() + + assert result is None + + async def test_provider_healthy( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test OIDC check returns healthy for working provider.""" + # Add a mock provider + mock_provider = MagicMock() + mock_provider.issuer = "https://accounts.google.com" + mock_config.oidc.providers = {"google": mock_provider} + + mock_app_deps.jwks_service.fetch_jwks = AsyncMock(return_value={}) + + service = HealthCheckService(mock_app_deps, mock_config) + result = await service.check_oidc_providers() + + assert result is not None + assert "google" in result + assert result["google"].status == ServiceStatus.HEALTHY + assert result["google"].issuer == "https://accounts.google.com" + + async def test_provider_unhealthy( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test OIDC check returns unhealthy for failing provider.""" + # Add a mock provider + mock_provider = MagicMock() + mock_provider.issuer = "https://accounts.google.com" + mock_config.oidc.providers = {"google": mock_provider} + + mock_app_deps.jwks_service.fetch_jwks = AsyncMock( + side_effect=Exception("JWKS fetch failed") + ) + + service = HealthCheckService(mock_app_deps, mock_config) + result = await service.check_oidc_providers() + + assert result is not None + assert "google" in result + assert result["google"].status == ServiceStatus.UNHEALTHY + assert "JWKS fetch failed" in (result["google"].error or "") + + +class 
TestOverallHealthEvaluation: + """Tests for the _evaluate_overall_health method.""" + + async def test_oidc_failure_critical_in_production( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test OIDC failure makes app NOT_READY in production.""" + mock_config.app.environment = "production" + + # Add a failing OIDC provider + mock_provider = MagicMock() + mock_provider.issuer = "https://accounts.google.com" + mock_config.oidc.providers = {"google": mock_provider} + + mock_app_deps.jwks_service.fetch_jwks = AsyncMock( + side_effect=Exception("JWKS fetch failed") + ) + + service = HealthCheckService(mock_app_deps, mock_config) + result = await service.check_all() + + assert result.status == OverallStatus.NOT_READY + + async def test_oidc_failure_not_critical_in_development( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test OIDC failure doesn't make app NOT_READY in development.""" + mock_config.app.environment = "development" + + # Add a failing OIDC provider + mock_provider = MagicMock() + mock_provider.issuer = "https://accounts.google.com" + mock_config.oidc.providers = {"google": mock_provider} + + mock_app_deps.jwks_service.fetch_jwks = AsyncMock( + side_effect=Exception("JWKS fetch failed") + ) + + service = HealthCheckService(mock_app_deps, mock_config) + result = await service.check_all() + + # In development, OIDC failures are not critical + assert result.status == OverallStatus.READY From 7a6cee12706a76cc13e3705da457c1aeb20658aa Mon Sep 17 00:00:00 2001 From: jc Date: Wed, 3 Dec 2025 22:06:59 -0500 Subject: [PATCH 3/5] checkpoint --- docs/fastapi-kubernetes-deployment.md | 36 +- docs/fastapi-kubernetes-ingress.md | 587 ++++++++++++++++++ infra/helm/api-forge/templates/ingress.yaml | 52 ++ infra/helm/api-forge/values.yaml | 84 ++- src/cli/deploy_commands.py | 18 + src/cli/deployment/helm_deployer/deployer.py | 12 +- .../deployment/helm_deployer/helm_release.py | 35 +- 7 files changed, 790 insertions(+), 34 deletions(-) create mode 100644 docs/fastapi-kubernetes-ingress.md create mode 100644 infra/helm/api-forge/templates/ingress.yaml diff --git a/docs/fastapi-kubernetes-deployment.md b/docs/fastapi-kubernetes-deployment.md index 8d49f46..108770d 100644 --- a/docs/fastapi-kubernetes-deployment.md +++ b/docs/fastapi-kubernetes-deployment.md @@ -863,35 +863,18 @@ spec: ### Ingress -Expose your FastAPI application via Ingress: +API Forge includes built-in Ingress support via CLI flags. Enable external access with: -```yaml -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: app-ingress - annotations: - cert-manager.io/cluster-issuer: letsencrypt-prod - nginx.ingress.kubernetes.io/ssl-redirect: "true" -spec: - ingressClassName: nginx - tls: - - hosts: - - api.example.com - secretName: app-tls - rules: - - host: api.example.com - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: app - port: - number: 8000 +```bash +# Basic ingress (HTTP) +uv run api-forge-cli deploy up k8s --ingress + +# Custom hostname with TLS +uv run api-forge-cli deploy up k8s --ingress --ingress-host api.example.com --ingress-tls-secret api-tls ``` +For comprehensive Ingress documentation including TLS setup, cloud provider configurations, and troubleshooting, see the **[Ingress Configuration Guide](./fastapi-kubernetes-ingress.md)**. 
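+
+As a quick sanity check after enabling `--ingress` (a minimal sketch assuming the default `api.local` host, the `api-forge-prod` namespace, and an `/etc/hosts` entry pointing at your cluster):
+
+```bash
+# Confirm the Ingress resource exists and has been assigned an address
+kubectl get ingress -n api-forge-prod
+
+# Hit the app through the Ingress controller (relies on the /etc/hosts entry)
+curl http://api.local/health
+```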
+ ### NetworkPolicies Restrict pod-to-pod communication: @@ -1350,6 +1333,7 @@ kubectl apply -f argocd-application.yaml ## Related Documentation +- [Ingress Configuration](./fastapi-kubernetes-ingress.md) - External access, TLS, and routing - [Docker Dev Environment](./fastapi-docker-dev-environment.md) - Local testing before deployment - [Docker Compose Production](./fastapi-production-deployment-docker-compose.md) - Alternative deployment - [Testing Strategy](./fastapi-testing-strategy.md) - Test before deploying diff --git a/docs/fastapi-kubernetes-ingress.md b/docs/fastapi-kubernetes-ingress.md new file mode 100644 index 0000000..6caa455 --- /dev/null +++ b/docs/fastapi-kubernetes-ingress.md @@ -0,0 +1,587 @@ +# FastAPI Kubernetes Ingress Configuration + +Learn how to expose your FastAPI application externally using Kubernetes Ingress with API Forge. This guide covers Ingress setup, CLI usage, TLS configuration, and cloud provider integrations. + +## Overview + +API Forge includes built-in Ingress support for exposing your FastAPI application to external traffic. The Ingress resource provides: + +- **Host-based routing** - Route traffic to your app based on hostname +- **TLS/HTTPS termination** - Secure connections with SSL certificates +- **Path-based routing** - Route different paths to different services +- **Load balancing** - Distribute traffic across pod replicas +- **Cloud provider integration** - Works with AWS ALB, GCP GCLB, Azure, and NGINX + +> **Note:** Ingress is for the **app service only**. Internal services (PostgreSQL, Redis, Temporal) remain as ClusterIP services and are not exposed externally. + +## Quick Start + +Enable Ingress with the CLI: + +```bash +# Enable ingress with default host (api.local) +uv run api-forge-cli deploy up k8s --ingress + +# Custom hostname +uv run api-forge-cli deploy up k8s --ingress --ingress-host api.example.com + +# With TLS (reference to a K8s TLS secret) +uv run api-forge-cli deploy up k8s --ingress --ingress-host api.example.com --ingress-tls-secret api-tls +``` + +## Prerequisites + +Before enabling Ingress, ensure you have: + +### 1. Ingress Controller Installed + +An Ingress Controller must be installed in your cluster. The Ingress resource is just configuration - the controller does the actual routing. + +**Minikube (local development):** +```bash +minikube addons enable ingress + +# Verify the controller is running +kubectl get pods -n ingress-nginx +``` + +**Production clusters (GKE, EKS, AKS):** +```bash +# Install NGINX Ingress Controller via Helm +helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx +helm repo update + +helm install ingress-nginx ingress-nginx/ingress-nginx \ + --namespace ingress-nginx \ + --create-namespace + +# Wait for the controller to be ready +kubectl wait --namespace ingress-nginx \ + --for=condition=ready pod \ + --selector=app.kubernetes.io/component=controller \ + --timeout=120s + +# Verify installation +kubectl get svc -n ingress-nginx +``` + +### 2. 
DNS Configuration (Production) + +For production deployments, configure DNS to point to your cluster: + +- **Minikube:** Add to `/etc/hosts`: `$(minikube ip) api.local` +- **Cloud providers:** Create a DNS A/CNAME record pointing to the Load Balancer IP + +## CLI Reference + +The `deploy up k8s` command supports three Ingress-related flags: + +| Flag | Default | Description | +|------|---------|-------------| +| `--ingress` | `false` | Enable Ingress for external access | +| `--ingress-host` | `api.local` | Hostname for the Ingress rule | +| `--ingress-tls-secret` | None | Kubernetes TLS secret name for HTTPS | + +### Examples + +**Basic Ingress (HTTP only):** +```bash +uv run api-forge-cli deploy up k8s --ingress +# Access at: http://api.local (requires /etc/hosts entry) +``` + +**Custom hostname:** +```bash +uv run api-forge-cli deploy up k8s --ingress --ingress-host myapi.example.com +# Access at: http://myapi.example.com +``` + +**HTTPS with existing TLS secret:** +```bash +uv run api-forge-cli deploy up k8s \ + --ingress \ + --ingress-host api.example.com \ + --ingress-tls-secret api-tls +# Access at: https://api.example.com +``` + +## How Ingress Routing Works + +Understanding the request flow helps troubleshoot issues: + +``` +Internet Request (https://api.example.com/docs) + │ + ▼ +DNS Lookup + │ api.example.com → 34.123.45.67 (Load Balancer IP) + ▼ +Cloud Load Balancer + │ Provisioned by Ingress Controller + ▼ +Ingress Controller Pods (nginx) + │ 1. Reads HTTP Host header from request + │ 2. Matches against Ingress rules + │ 3. Finds: host=api.example.com → service=app:8000 + ▼ +Kubernetes Service (app) + │ ClusterIP, port 8000 + ▼ +FastAPI Pod(s) + │ Your application + ▼ +Response flows back through the chain +``` + +**Key Points:** + +1. The browser automatically sets the `Host` header from the URL +2. NGINX Ingress matches this header against configured Ingress resources +3. Traffic is proxied to the backend Kubernetes Service +4. 
The Service load-balances across pod replicas + +## Configuration + +### Values.yaml Structure + +The Ingress configuration lives in `infra/helm/api-forge/values.yaml`: + +```yaml +app: + ingress: + enabled: false # Set to true or use --ingress flag + className: nginx # Ingress controller class + annotations: {} # Provider-specific annotations + hosts: + - host: api-forge.local + paths: + - path: / + pathType: Prefix + tls: [] # TLS configuration +``` + +When using CLI flags, they override these values: + +```bash +# CLI command: +uv run api-forge-cli deploy up k8s --ingress --ingress-host api.example.com --ingress-tls-secret api-tls + +# Equivalent values.yaml: +app: + ingress: + enabled: true + hosts: + - host: api.example.com + paths: + - path: / + pathType: Prefix + tls: + - secretName: api-tls + hosts: + - api.example.com +``` + +### Direct values.yaml Configuration + +For more complex setups, edit `values.yaml` directly: + +```yaml +app: + ingress: + enabled: true + className: nginx + annotations: + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/proxy-body-size: "10m" + hosts: + - host: api.example.com + paths: + - path: / + pathType: Prefix + - path: /api/v2 + pathType: Prefix + tls: + - secretName: api-tls + hosts: + - api.example.com +``` + +## TLS/HTTPS Configuration + +### Option 1: Manual TLS Secret + +Create a TLS secret from existing certificates: + +```bash +# Create TLS secret from certificate files +kubectl create secret tls api-tls \ + --cert=path/to/tls.crt \ + --key=path/to/tls.key \ + -n api-forge-prod + +# Deploy with TLS +uv run api-forge-cli deploy up k8s \ + --ingress \ + --ingress-host api.example.com \ + --ingress-tls-secret api-tls +``` + +### Option 2: Cert-Manager (Recommended for Production) + +Use [cert-manager](https://cert-manager.io/) for automatic Let's Encrypt certificates: + +**1. Install cert-manager:** +```bash +helm repo add jetstack https://charts.jetstack.io +helm repo update + +helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --set installCRDs=true +``` + +**2. Create a ClusterIssuer:** +```yaml +# cluster-issuer.yaml +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-prod +spec: + acme: + email: your-email@example.com + server: https://acme-v02.api.letsencrypt.org/directory + privateKeySecretRef: + name: letsencrypt-prod-account-key + solvers: + - http01: + ingress: + class: nginx +``` + +```bash +kubectl apply -f cluster-issuer.yaml +``` + +**3. Configure Ingress in values.yaml:** +```yaml +app: + ingress: + enabled: true + className: nginx + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod # Triggers cert-manager + hosts: + - host: api.example.com + paths: + - path: / + pathType: Prefix + tls: + - secretName: api-tls # cert-manager creates this automatically + hosts: + - api.example.com +``` + +**How it works:** +1. Deploy the Ingress with the annotation and non-existent TLS secret +2. Cert-manager detects the `cert-manager.io/cluster-issuer` annotation +3. Cert-manager contacts Let's Encrypt, proves domain ownership via HTTP-01 challenge +4. Cert-manager creates the `api-tls` secret with the certificate +5. NGINX Ingress loads the certificate and serves HTTPS +6. 
Cert-manager auto-renews before expiry + +## Cloud Provider Examples + +### AWS ALB Ingress + +Use AWS Application Load Balancer: + +```yaml +app: + ingress: + enabled: true + className: alb + annotations: + alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/target-type: ip + alb.ingress.kubernetes.io/certificate-arn: arn:aws:acm:us-east-1:123456789:certificate/abc-123 + alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]' + alb.ingress.kubernetes.io/ssl-redirect: '443' + hosts: + - host: api.example.com + paths: + - path: / + pathType: Prefix + tls: + - hosts: + - api.example.com +``` + +**Prerequisites:** +- AWS Load Balancer Controller installed +- ACM certificate created for your domain + +### GKE with Google Cloud Load Balancer + +```yaml +app: + ingress: + enabled: true + className: gce + annotations: + kubernetes.io/ingress.global-static-ip-name: api-ip + networking.gke.io/managed-certificates: api-cert + hosts: + - host: api.example.com + paths: + - path: / + pathType: Prefix +``` + +**Prerequisites:** +- Reserve a global static IP: `gcloud compute addresses create api-ip --global` +- Create a ManagedCertificate resource + +### Azure Application Gateway + +```yaml +app: + ingress: + enabled: true + className: azure/application-gateway + annotations: + appgw.ingress.kubernetes.io/ssl-redirect: "true" + appgw.ingress.kubernetes.io/use-private-ip: "false" + hosts: + - host: api.example.com + paths: + - path: / + pathType: Prefix + tls: + - secretName: api-tls + hosts: + - api.example.com +``` + +## Minikube Local Development + +### Setup + +```bash +# Enable ingress addon +minikube addons enable ingress + +# Get Minikube IP +minikube ip +# Example output: 192.168.49.2 + +# Add to /etc/hosts +echo "$(minikube ip) api.local" | sudo tee -a /etc/hosts +``` + +### Deploy and Access + +```bash +# Deploy with Ingress +uv run api-forge-cli deploy up k8s --ingress + +# Access your API +curl http://api.local/health +open http://api.local/docs +``` + +### Minikube Tunnel (Alternative) + +If you have DNS issues, use `minikube tunnel`: + +```bash +# In a separate terminal (requires sudo) +minikube tunnel + +# This exposes LoadBalancer services on localhost +# Access at http://localhost:8000 (if using LoadBalancer type) +``` + +## Why Use a Real Domain for Production + +For production deployments, use your actual domain (not `api.local`) because: + +| Feature | Why Real Domain Required | +|---------|-------------------------| +| **TLS/HTTPS** | Certificates are issued for specific domains. Mismatch causes TLS handshake failure. | +| **OAuth/OIDC** | Redirect URIs registered with providers (Google, Microsoft) must match exactly. | +| **Cookies** | Set with `domain=example.com`. Won't work if Host header differs. | +| **CORS** | Configured for specific origins. Mismatched Host breaks cross-origin requests. | +| **Browser Security** | Browsers set Host header automatically from URL - cannot be overridden. | + +**For local development:** `api.local` works because you control the hosts file and don't need real TLS/OAuth. 
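+
+To see this Host-header matching in practice, here is a minimal check against a Minikube cluster (assuming the `api.local` `/etc/hosts` entry from the setup above; the exact 404 body depends on your controller's default backend):
+
+```bash
+# api.local resolves via /etc/hosts, so the Host header matches the Ingress rule
+curl http://api.local/health
+
+# Same Ingress IP, but the Host header is the raw IP and matches no rule,
+# so NGINX typically answers with its default-backend 404
+curl http://$(minikube ip)/health
+
+# Overriding the Host header makes the same request match the rule again
+curl -H "Host: api.local" http://$(minikube ip)/health
+```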
+ +## Service Types Comparison + +Understanding when to use Ingress vs other service types: + +| Type | Use Case | External Access | Example Services | +|------|----------|----------------|------------------| +| **ClusterIP** | Internal services only | No | PostgreSQL, Redis, Temporal | +| **NodePort** | Dev/testing, direct node access | Yes (nodeIP:port) | Debugging only | +| **LoadBalancer** | Single service exposure | Yes (external IP) | Simple apps without Ingress | +| **Ingress** | Production APIs, TLS, routing | Yes (hostname) | FastAPI app | + +**Recommendation:** Use Ingress for the app service. Keep PostgreSQL, Redis, and Temporal as ClusterIP (internal only) for security. + +## Troubleshooting + +### Ingress Not Accessible + +**1. Check Ingress Controller is running:** +```bash +kubectl get pods -n ingress-nginx +# Should show controller pods in Running state +``` + +**2. Check Ingress resource exists:** +```bash +kubectl get ingress -n api-forge-prod +kubectl describe ingress app -n api-forge-prod +``` + +**3. Check Ingress has an address:** +```bash +kubectl get ingress -n api-forge-prod -o wide +# ADDRESS column should show an IP or hostname +``` + +**4. Verify backend service exists:** +```bash +kubectl get svc app -n api-forge-prod +kubectl get endpoints app -n api-forge-prod +# Endpoints should list pod IPs +``` + +### 404 Not Found + +**Cause:** Host header doesn't match any Ingress rule. + +**Solution:** +```bash +# Check the configured host +kubectl describe ingress app -n api-forge-prod | grep Host + +# Ensure you're using the correct hostname +curl -H "Host: api.example.com" http:///health +``` + +### 502 Bad Gateway + +**Cause:** Backend pods not ready or service misconfiguration. + +**Solution:** +```bash +# Check pods are running +kubectl get pods -n api-forge-prod -l app.kubernetes.io/name=app + +# Check pod logs +kubectl logs -n api-forge-prod -l app.kubernetes.io/name=app + +# Verify service targets correct port +kubectl describe svc app -n api-forge-prod +``` + +### TLS Certificate Issues + +**1. Check TLS secret exists:** +```bash +kubectl get secret api-tls -n api-forge-prod +``` + +**2. If using cert-manager, check certificate status:** +```bash +kubectl get certificate -n api-forge-prod +kubectl describe certificate api-tls -n api-forge-prod +``` + +**3. Check cert-manager logs:** +```bash +kubectl logs -n cert-manager -l app.kubernetes.io/component=controller +``` + +### DNS Resolution Issues + +**1. Verify DNS resolves correctly:** +```bash +nslookup api.example.com +dig api.example.com +``` + +**2. For local development, check /etc/hosts:** +```bash +cat /etc/hosts | grep api.local +``` + +**3. Test with IP directly:** +```bash +# Get the Ingress IP +kubectl get ingress -n api-forge-prod -o jsonpath='{.items[0].status.loadBalancer.ingress[0].ip}' + +# Curl with Host header +curl -H "Host: api.example.com" http:///health +``` + +### Ingress Controller Logs + +```bash +# NGINX Ingress Controller logs +kubectl logs -n ingress-nginx -l app.kubernetes.io/component=controller --tail=100 + +# Look for errors related to your ingress +kubectl logs -n ingress-nginx -l app.kubernetes.io/component=controller | grep api-forge +``` + +## Tips and Best Practices + +1. **Start with HTTP** - Get routing working before adding TLS +2. **Use meaningful hostnames** - Match your actual domain structure +3. **Enable SSL redirect** - Redirect HTTP to HTTPS in production +4. **Set appropriate timeouts** - Configure proxy timeouts for long-running requests +5. 
**Limit request size** - Use `proxy-body-size` annotation for file uploads +6. **Monitor Ingress Controller** - Set up metrics and alerting +7. **Use separate Ingress per environment** - Different hosts for staging vs production + +### Common Annotations + +```yaml +annotations: + # SSL redirect (force HTTPS) + nginx.ingress.kubernetes.io/ssl-redirect: "true" + + # Request body size limit + nginx.ingress.kubernetes.io/proxy-body-size: "10m" + + # Timeouts for slow backends + nginx.ingress.kubernetes.io/proxy-read-timeout: "600" + nginx.ingress.kubernetes.io/proxy-send-timeout: "600" + + # WebSocket support + nginx.ingress.kubernetes.io/proxy-http-version: "1.1" + nginx.ingress.kubernetes.io/upstream-hash-by: "$http_upgrade" + + # Rate limiting + nginx.ingress.kubernetes.io/limit-rps: "10" + nginx.ingress.kubernetes.io/limit-connections: "5" +``` + +## Related Documentation + +- [Kubernetes Deployment Guide](./fastapi-kubernetes-deployment.md) - Complete K8s deployment guide +- [Docker Dev Environment](./fastapi-docker-dev-environment.md) - Local development setup +- [Secrets Management](./security/secrets_management.md) - Managing TLS secrets +- [Production Deployment](./fastapi-production-deployment-docker-compose.md) - Docker Compose alternative + +## Additional Resources + +- [NGINX Ingress Controller](https://kubernetes.github.io/ingress-nginx/) +- [cert-manager Documentation](https://cert-manager.io/docs/) +- [AWS Load Balancer Controller](https://kubernetes-sigs.github.io/aws-load-balancer-controller/) +- [GKE Ingress](https://cloud.google.com/kubernetes-engine/docs/concepts/ingress) +- [Kubernetes Ingress Concepts](https://kubernetes.io/docs/concepts/services-networking/ingress/) diff --git a/infra/helm/api-forge/templates/ingress.yaml b/infra/helm/api-forge/templates/ingress.yaml new file mode 100644 index 0000000..27c70de --- /dev/null +++ b/infra/helm/api-forge/templates/ingress.yaml @@ -0,0 +1,52 @@ +{{- if .Values.app.ingress.enabled -}} +{{/* +Ingress resource for exposing the API application externally. +Supports: +- Multiple hosts and paths +- TLS termination +- Ingress class selection (nginx, traefik, etc.) +- Provider-specific annotations (AWS ALB, GCP, etc.) +*/}} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ .Values.app.name | default "app" }} + namespace: {{ .Values.global.namespace }} + labels: + app.kubernetes.io/name: {{ .Values.app.name | default "app" }} + app.kubernetes.io/component: ingress + {{- include "api-forge.labels" . | nindent 4 }} + {{- with .Values.app.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.app.ingress.className }} + ingressClassName: {{ .Values.app.ingress.className }} + {{- end }} + {{- if .Values.app.ingress.tls }} + tls: + {{- range .Values.app.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.app.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + pathType: {{ .pathType | default "Prefix" }} + backend: + service: + name: {{ $.Values.app.name | default "app" }} + port: + number: {{ $.Values.app.service.port | default 8000 }} + {{- end }} + {{- end }} +{{- end }} diff --git a/infra/helm/api-forge/values.yaml b/infra/helm/api-forge/values.yaml index 191c967..9621158 100644 --- a/infra/helm/api-forge/values.yaml +++ b/infra/helm/api-forge/values.yaml @@ -217,16 +217,92 @@ app: service: type: ClusterIP port: 8000 + # --------------------------------------------------------------------------- + # Ingress Configuration + # --------------------------------------------------------------------------- + # Exposes the API externally via an Ingress controller. + # + # Prerequisites: + # - An Ingress controller must be installed in the cluster + # - For Minikube: minikube addons enable ingress + # - For cloud: Use provider's ingress controller or install nginx-ingress + # + # Example configurations: + # + # Minikube (local development): + # ingress: + # enabled: true + # className: nginx + # hosts: + # - host: api.local + # paths: + # - path: / + # pathType: Prefix + # # Add to /etc/hosts: $(minikube ip) api.local + # + # AWS ALB: + # ingress: + # enabled: true + # className: alb + # annotations: + # alb.ingress.kubernetes.io/scheme: internet-facing + # alb.ingress.kubernetes.io/target-type: ip + # alb.ingress.kubernetes.io/certificate-arn: arn:aws:acm:... + # hosts: + # - host: api.example.com + # paths: + # - path: / + # pathType: Prefix + # tls: + # - hosts: + # - api.example.com + # + # GKE with Google Cloud Load Balancer: + # ingress: + # enabled: true + # className: gce + # annotations: + # kubernetes.io/ingress.global-static-ip-name: api-ip + # networking.gke.io/managed-certificates: api-cert + # hosts: + # - host: api.example.com + # paths: + # - path: / + # pathType: Prefix + # + # NGINX with cert-manager TLS: + # ingress: + # enabled: true + # className: nginx + # annotations: + # cert-manager.io/cluster-issuer: letsencrypt-prod + # nginx.ingress.kubernetes.io/ssl-redirect: "true" + # hosts: + # - host: api.example.com + # paths: + # - path: / + # pathType: Prefix + # tls: + # - secretName: api-tls + # hosts: + # - api.example.com + # --------------------------------------------------------------------------- ingress: enabled: false className: nginx annotations: {} + # nginx.ingress.kubernetes.io/ssl-redirect: "true" + # nginx.ingress.kubernetes.io/proxy-body-size: "10m" + # cert-manager.io/cluster-issuer: letsencrypt-prod hosts: - - host: api-forge.local - paths: - - path: / - pathType: Prefix + - host: api-forge.local + paths: + - path: / + pathType: Prefix tls: [] + # - secretName: api-forge-tls + # hosts: + # - api-forge.local # Horizontal Pod Autoscaler autoscaling: enabled: false diff --git a/src/cli/deploy_commands.py b/src/cli/deploy_commands.py index 8608ba7..2a76d83 100644 --- a/src/cli/deploy_commands.py +++ b/src/cli/deploy_commands.py @@ -57,6 +57,21 @@ def up( "-r", help="Container registry for remote k8s clusters (e.g., ghcr.io/myuser)", ), + ingress: bool = typer.Option( + False, + "--ingress/--no-ingress", + help="Enable Ingress for external access (k8s only)", + ), + ingress_host: str = typer.Option( + None, + "--ingress-host", + help="Ingress hostname (k8s only, e.g., api.example.com)", + ), + 
ingress_tls_secret: str = typer.Option( + None, + "--ingress-tls-secret", + help="TLS secret name for HTTPS (k8s only)", + ), ) -> None: """ 🚀 Deploy the application to the specified environment. @@ -101,6 +116,9 @@ def up( no_wait=no_wait, force_recreate=force_recreate, registry=registry, + ingress_enabled=ingress, + ingress_host=ingress_host, + ingress_tls_secret=ingress_tls_secret, ) except DeploymentError as e: diff --git a/src/cli/deployment/helm_deployer/deployer.py b/src/cli/deployment/helm_deployer/deployer.py index 879826d..c35f574 100644 --- a/src/cli/deployment/helm_deployer/deployer.py +++ b/src/cli/deployment/helm_deployer/deployer.py @@ -164,6 +164,9 @@ def deploy( self, namespace: str | None = None, registry: str | None = None, + ingress_enabled: bool = False, + ingress_host: str | None = None, + ingress_tls_secret: str | None = None, **kwargs: Any, ) -> None: """Deploy to Kubernetes cluster. @@ -182,6 +185,9 @@ def deploy( Args: namespace: Kubernetes namespace (default: api-forge-prod) registry: Container registry for remote clusters + ingress_enabled: Whether to enable Ingress for external access + ingress_host: Hostname for Ingress (e.g., api.example.com) + ingress_tls_secret: TLS secret name for HTTPS **kwargs: Reserved for future options """ if not self.check_env_file(): @@ -244,7 +250,11 @@ def deploy( # Phase 4: Deploy via Helm image_override_file = self.helm_release.create_image_override_file( - image_tag, registry + image_tag, + registry, + ingress_enabled=ingress_enabled, + ingress_host=ingress_host, + ingress_tls_secret=ingress_tls_secret, ) self.helm_release.deploy_release(namespace, image_override_file) diff --git a/src/cli/deployment/helm_deployer/helm_release.py b/src/cli/deployment/helm_deployer/helm_release.py index 76f2420..5a8e9b3 100644 --- a/src/cli/deployment/helm_deployer/helm_release.py +++ b/src/cli/deployment/helm_deployer/helm_release.py @@ -8,7 +8,7 @@ import tempfile from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import yaml # type: ignore[import-untyped] @@ -55,12 +55,18 @@ def create_image_override_file( self, image_tag: str, registry: str | None = None, + ingress_enabled: bool = False, + ingress_host: str | None = None, + ingress_tls_secret: str | None = None, ) -> Path: - """Create a temporary values file to override image tags. + """Create a temporary values file to override image tags and ingress. 
Args: image_tag: The unique image tag to use for all images registry: Optional container registry prefix for remote clusters + ingress_enabled: Whether to enable Ingress for external access + ingress_host: Hostname for Ingress (e.g., api.example.com) + ingress_tls_secret: TLS secret name for HTTPS Returns: Path to the temporary override file @@ -77,7 +83,7 @@ def create_image_override_file( redis_repo = self.constants.REDIS_IMAGE_NAME temporal_repo = self.constants.TEMPORAL_IMAGE_NAME - override_values = { + override_values: dict[str, Any] = { "app": {"image": {"repository": app_repo, "tag": image_tag}}, "worker": {"image": {"repository": app_repo, "tag": image_tag}}, # Use content-based tags for infra images to avoid stale image issues @@ -86,6 +92,29 @@ def create_image_override_file( "temporal": {"image": {"repository": temporal_repo, "tag": image_tag}}, } + # Add ingress configuration if enabled + if ingress_enabled: + ingress_config: dict[str, Any] = {"enabled": True} + + # Set hostname if provided + host = ingress_host or "api.local" + ingress_config["hosts"] = [ + {"host": host, "paths": [{"path": "/", "pathType": "Prefix"}]} + ] + + # Add TLS configuration if secret is provided + if ingress_tls_secret: + ingress_config["tls"] = [ + {"secretName": ingress_tls_secret, "hosts": [host]} + ] + + override_values["app"]["ingress"] = ingress_config + + self.console.print( + f"[bold cyan]🌐 Ingress enabled:[/bold cyan] {host}" + + (f" (TLS: {ingress_tls_secret})" if ingress_tls_secret else "") + ) + temp_file = Path(tempfile.mktemp(suffix=".yaml", prefix="helm-image-override-")) with open(temp_file, "w") as f: yaml.dump(override_values, f, default_flow_style=False) From a22c701570e13478334a4e974a4134c1880c2a04 Mon Sep 17 00:00:00 2001 From: jc Date: Wed, 3 Dec 2025 22:52:22 -0500 Subject: [PATCH 4/5] Adding ingress docs --- docs/fastapi-kubernetes-ingress.md | 146 ++++++++++++++++++++++++++--- 1 file changed, 135 insertions(+), 11 deletions(-) diff --git a/docs/fastapi-kubernetes-ingress.md b/docs/fastapi-kubernetes-ingress.md index 6caa455..d5f1497 100644 --- a/docs/fastapi-kubernetes-ingress.md +++ b/docs/fastapi-kubernetes-ingress.md @@ -228,9 +228,53 @@ uv run api-forge-cli deploy up k8s \ ### Option 2: Cert-Manager (Recommended for Production) -Use [cert-manager](https://cert-manager.io/) for automatic Let's Encrypt certificates: - -**1. Install cert-manager:** +Use [cert-manager](https://cert-manager.io/) for automatic Let's Encrypt certificates. + +> **Note:** The existing self-signed certificate mechanism (used for PostgreSQL, Redis) won't work for Ingress TLS. Browsers and external clients don't trust the internal CA, OAuth providers reject self-signed certificates, and there's no way to install the CA on all public clients. Use cert-manager with Let's Encrypt for production, or skip TLS for local development. + +#### Understanding Cert-Manager Architecture + +Cert-manager consists of **two parts**: + +1. **The Deployment** (pods that do the work) - Installed via Helm +2. 
**ClusterIssuer** (configuration) - Tells cert-manager HOW to obtain certificates + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ cert-manager namespace │ +│ ┌─────────────────────┐ │ +│ │ cert-manager pod │◄─── The actual controller (deployment) │ +│ │ (watches Ingresses) │ │ +│ └──────────┬──────────┘ │ +│ │ │ +│ │ Reads configuration from: │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ ClusterIssuer: letsencrypt-prod (cluster-scoped config) │ │ +│ │ - ACME server: Let's Encrypt │ │ +│ │ - Solver: HTTP-01 via nginx ingress │ │ +│ │ - Account key: stored in Secret │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────┘ + │ + │ When Ingress has annotation: + │ cert-manager.io/cluster-issuer: letsencrypt-prod + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ api-forge-prod namespace │ +│ ┌─────────────────────┐ ┌─────────────────────┐ │ +│ │ Ingress: app │────────▶│ Secret: api-tls │ │ +│ │ (your config) │ │ (created by │ │ +│ │ │ │ cert-manager) │ │ +│ └─────────────────────┘ └─────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +The `ClusterIssuer` is **not a deployment** - it's a Custom Resource (configuration object) that cert-manager reads. You create it once per cluster, and any Ingress in any namespace can reference it. + +#### Step-by-Step Setup + +**1. Install cert-manager (the deployment):** ```bash helm repo add jetstack https://charts.jetstack.io helm repo update @@ -239,21 +283,38 @@ helm install cert-manager jetstack/cert-manager \ --namespace cert-manager \ --create-namespace \ --set installCRDs=true + +# Wait for pods to be ready +kubectl wait --namespace cert-manager \ + --for=condition=ready pod \ + --selector=app.kubernetes.io/instance=cert-manager \ + --timeout=120s ``` -**2. Create a ClusterIssuer:** +This creates: +- `cert-manager` pod - the controller that obtains certificates +- `cert-manager-webhook` pod - validates custom resources +- `cert-manager-cainjector` pod - injects CA bundles +- Custom Resource Definitions (CRDs) for `ClusterIssuer`, `Certificate`, etc. + +**2. Create a ClusterIssuer (configuration only - no new pods):** ```yaml # cluster-issuer.yaml apiVersion: cert-manager.io/v1 kind: ClusterIssuer metadata: name: letsencrypt-prod + # No namespace - ClusterIssuer is cluster-scoped spec: acme: + # Email for Let's Encrypt expiry notifications email: your-email@example.com + # Production ACME server server: https://acme-v02.api.letsencrypt.org/directory + # Secret to store your ACME account private key (created automatically) privateKeySecretRef: name: letsencrypt-prod-account-key + # How to prove domain ownership solvers: - http01: ingress: @@ -262,8 +323,15 @@ spec: ```bash kubectl apply -f cluster-issuer.yaml + +# Verify it's ready +kubectl get clusterissuer letsencrypt-prod +# NAME READY AGE +# letsencrypt-prod True 30s ``` +> **About `privateKeySecretRef`:** The `letsencrypt-prod-account-key` secret is created automatically by cert-manager on first use. It stores your Let's Encrypt account private key - you don't create it manually. + **3. Configure Ingress in values.yaml:** ```yaml app: @@ -283,13 +351,69 @@ app: - api.example.com ``` -**How it works:** -1. Deploy the Ingress with the annotation and non-existent TLS secret -2. Cert-manager detects the `cert-manager.io/cluster-issuer` annotation -3. 
Cert-manager contacts Let's Encrypt, proves domain ownership via HTTP-01 challenge -4. Cert-manager creates the `api-tls` secret with the certificate -5. NGINX Ingress loads the certificate and serves HTTPS -6. Cert-manager auto-renews before expiry +#### Certificate Issuance Flow + +When you deploy with the above configuration: + +1. **Ingress created** with `cert-manager.io/cluster-issuer` annotation +2. **Cert-manager detects** the annotation (it watches all Ingresses) +3. **Cert-manager creates** a temporary Ingress for the HTTP-01 challenge +4. **Let's Encrypt calls** `http://api.example.com/.well-known/acme-challenge/` +5. **Cert-manager responds** with proof of domain ownership +6. **Let's Encrypt issues** the certificate +7. **Cert-manager stores** it in the `api-tls` Secret +8. **NGINX Ingress loads** the certificate and serves HTTPS +9. **Cert-manager auto-renews** before expiry (default: 30 days before) + +#### Verify Certificate Status + +```bash +# Check ClusterIssuer is ready +kubectl get clusterissuer letsencrypt-prod + +# Check Certificate resource (created automatically from Ingress) +kubectl get certificate -n api-forge-prod +kubectl describe certificate -n api-forge-prod + +# Check the TLS secret was created +kubectl get secret api-tls -n api-forge-prod + +# If issues, check cert-manager logs +kubectl logs -n cert-manager -l app.kubernetes.io/component=controller --tail=100 +``` + +#### ClusterIssuer vs Issuer + +| Type | Scope | Use Case | +|------|-------|----------| +| `ClusterIssuer` | Cluster-wide | One issuer, all namespaces can use it | +| `Issuer` | Single namespace | Per-namespace control, multi-tenant clusters | + +For most setups, `ClusterIssuer` is simpler - create once, reference from any namespace. + +#### Staging vs Production + +Let's Encrypt has [rate limits](https://letsencrypt.org/docs/rate-limits/). Use staging for testing: + +```yaml +# cluster-issuer-staging.yaml +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-staging +spec: + acme: + email: your-email@example.com + server: https://acme-staging-v02.api.letsencrypt.org/directory # Staging server + privateKeySecretRef: + name: letsencrypt-staging-account-key + solvers: + - http01: + ingress: + class: nginx +``` + +Staging certificates are **not trusted by browsers** but let you test the full flow without hitting rate limits. ## Cloud Provider Examples From e26d5f4a210f54aca855dd73be093b4c7496597b Mon Sep 17 00:00:00 2001 From: jc Date: Wed, 3 Dec 2025 23:16:39 -0500 Subject: [PATCH 5/5] checkpoint --- docs/fastapi-flyio-kubernetes.md | 309 +++++++++++++++++++++++++++++++ 1 file changed, 309 insertions(+) create mode 100644 docs/fastapi-flyio-kubernetes.md diff --git a/docs/fastapi-flyio-kubernetes.md b/docs/fastapi-flyio-kubernetes.md new file mode 100644 index 0000000..c42d181 --- /dev/null +++ b/docs/fastapi-flyio-kubernetes.md @@ -0,0 +1,309 @@ +# FastAPI Fly.io Kubernetes (FKS) Deployment + +This document covers deploying API Forge to Fly.io's Kubernetes Service (FKS), including compatibility analysis with our existing Helm-based deployment and future CLI design considerations. + +## Overview + +Fly.io Kubernetes Service (FKS) is Fly.io's managed Kubernetes offering that integrates with their global Anycast network. It provides a different deployment model than traditional Kubernetes clusters (GKE, EKS, AKS), with some unique advantages and constraints. + +> **Status:** FKS support is planned but not yet implemented. 
This document captures research and design decisions for future development. + +## FKS Key Characteristics + +### How FKS Differs from Standard Kubernetes + +| Aspect | Standard K8s (GKE, EKS, AKS) | Fly.io FKS | +|--------|------------------------------|------------| +| **Ingress** | Ingress resource + Ingress Controller (nginx) | Not used - Fly's Anycast proxy handles routing | +| **LoadBalancer** | Cloud provider provisions external IP | Maps directly to Fly.io's edge network | +| **TLS Certificates** | cert-manager + Let's Encrypt | Automatic - Fly provisions and renews certs | +| **DNS** | Manual - point domain to LB IP | Automatic `*.fly.dev` + custom domain support | +| **Global Distribution** | Multi-region requires complex setup | Built-in - deploy to 30+ regions easily | +| **Scaling** | HPA, node autoscaling | Fly Machines with automatic scaling | + +### What FKS Provides Automatically + +1. **Automatic TLS** - No cert-manager, no Let's Encrypt configuration needed +2. **Global Anycast** - Traffic routed to nearest region automatically +3. **DDoS Protection** - Built into Fly's edge network +4. **Automatic DNS** - `yourapp.fly.dev` domains provisioned automatically +5. **Custom Domains** - Simple CNAME setup with automatic certificate issuance + +### What FKS Does NOT Use + +- ❌ Ingress resources +- ❌ Ingress Controllers (nginx, traefik, etc.) +- ❌ cert-manager +- ❌ External DNS controllers +- ❌ Cloud provider load balancer integrations + +## Exposing Services on FKS + +### LoadBalancer Service (Recommended) + +On FKS, you expose services using `type: LoadBalancer` which Fly.io intercepts: + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: app + annotations: + # Fly-specific annotations + fly.io/app: my-fastapi-app +spec: + type: LoadBalancer + ports: + - port: 443 + targetPort: 8000 + selector: + app.kubernetes.io/name: app +``` + +Fly.io then: +1. Creates a Fly App if it doesn't exist +2. Provisions `my-fastapi-app.fly.dev` domain +3. Issues TLS certificate automatically +4. Routes global traffic through Anycast + +### Custom Domains + +```yaml +metadata: + annotations: + fly.io/app: my-fastapi-app + fly.io/domains: "api.example.com,api.mycompany.io" +``` + +Then add a CNAME record: +``` +api.example.com CNAME my-fastapi-app.fly.dev +``` + +Fly automatically issues certificates for custom domains. + +## Compatibility Analysis + +### Current API Forge Features vs FKS + +| API Forge Feature | Standard K8s | FKS Compatibility | +|-------------------|--------------|-------------------| +| Helm chart deployment | ✅ Works | ✅ Works (FKS is standard K8s) | +| PostgreSQL StatefulSet | ✅ Works | ⚠️ Consider Fly Postgres instead | +| Redis Deployment | ✅ Works | ⚠️ Consider Fly Redis (Upstash) | +| Temporal | ✅ Works | ✅ Works | +| `--ingress` flag | Creates Ingress | ❌ Should create LoadBalancer instead | +| `--ingress-host` | Sets Ingress host | Should set Fly app name | +| `--ingress-tls-secret` | References K8s Secret | ❌ Not needed | +| `--ingress-tls auto` | Uses cert-manager | ❌ Not needed | +| NetworkPolicies | ✅ Works | ✅ Works | +| PersistentVolumeClaims | ✅ Works | ✅ Works (Fly Volumes) | + +### Components That Need Adaptation + +1. **Ingress → LoadBalancer Service** + - Replace Ingress resource with LoadBalancer Service + - Add Fly-specific annotations + - Remove Ingress Controller dependency + +2. **TLS Configuration** + - Remove cert-manager setup + - Remove TLS secret references + - Fly handles all certificate management + +3. 
**Database Considerations** + - Could use in-cluster PostgreSQL (works but not recommended) + - Better: Use Fly Postgres (managed, with replicas) + - Fly Postgres uses their own clustering solution + +4. **Redis Considerations** + - Could use in-cluster Redis (works) + - Alternative: Upstash Redis (Fly partnership, serverless) + +## Proposed CLI Design + +### Option A: Unified Command with Detection (Complex) + +```bash +# CLI detects cluster type and adapts +uv run api-forge-cli deploy up k8s --ingress --ingress-host myapp + +# On FKS: Creates LoadBalancer Service with Fly annotations +# On standard K8s: Creates Ingress + optional cert-manager +``` + +**Pros:** Single command, automatic adaptation +**Cons:** Complex logic, harder to debug, surprises users + +### Option B: Separate Target (Recommended) + +```bash +# Explicit Fly.io target +uv run api-forge-cli deploy up fly --app myapp --region ord + +# Standard Kubernetes +uv run api-forge-cli deploy up k8s --ingress --ingress-host api.example.com +``` + +**Pros:** Clear intent, simpler implementation, Fly-specific optimizations +**Cons:** Another target to maintain + +### Recommended: Option B + +Separate targets are better because: + +1. **Fly has unique features** - Machines, regions, Fly Postgres are Fly-specific +2. **Simpler Helm chart** - No conditionals for "is this FKS?" +3. **Better UX** - Users explicitly choose their target +4. **Fly CLI integration** - Can leverage `flyctl` where appropriate +5. **Different defaults** - FKS might skip in-cluster Postgres entirely + +### Proposed CLI Commands + +```bash +# Setup Fly.io (one-time) +uv run api-forge-cli deploy setup fly +# - Verifies flyctl is installed +# - Authenticates with Fly.io +# - Creates Fly organization if needed + +# Deploy to Fly Kubernetes +uv run api-forge-cli deploy up fly \ + --app my-fastapi-app \ + --region ord \ + --postgres fly # Use Fly Postgres (recommended) + --redis upstash # Use Upstash Redis (optional) + +# Or with in-cluster databases (not recommended for production) +uv run api-forge-cli deploy up fly \ + --app my-fastapi-app \ + --postgres in-cluster \ + --redis in-cluster + +# Status +uv run api-forge-cli deploy status fly --app my-fastapi-app + +# Teardown +uv run api-forge-cli deploy down fly --app my-fastapi-app +``` + +### Implementation Phases + +**Phase 1: Basic FKS Support** +- Deploy app and worker to FKS +- Use LoadBalancer Service for external access +- In-cluster PostgreSQL and Redis (same as standard K8s) + +**Phase 2: Fly-Native Services** +- Fly Postgres integration +- Upstash Redis integration +- Fly Volumes for persistent storage + +**Phase 3: Advanced Features** +- Multi-region deployment +- Fly Machines autoscaling +- Fly.io metrics integration + +## Helm Chart Modifications for FKS + +### Conditional Ingress vs LoadBalancer + +```yaml +# values.yaml +app: + # Standard K8s ingress (existing) + ingress: + enabled: false + # ... 
existing config + + # Fly.io specific (new) + fly: + enabled: false + app: "" + regions: ["ord"] + domains: [] +``` + +### FKS-Specific Service Template + +```yaml +# templates/services/app-fly.yaml +{{- if .Values.app.fly.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ .Values.app.name | default "app" }} + namespace: {{ .Values.global.namespace }} + annotations: + fly.io/app: {{ .Values.app.fly.app | required "app.fly.app is required" }} + {{- if .Values.app.fly.domains }} + fly.io/domains: {{ .Values.app.fly.domains | join "," | quote }} + {{- end }} + labels: + {{- include "api-forge.labels" . | nindent 4 }} +spec: + type: LoadBalancer + ports: + - port: 443 + targetPort: {{ .Values.app.service.port | default 8000 }} + name: https + selector: + app.kubernetes.io/name: {{ .Values.app.name | default "app" }} +{{- end }} +``` + +## TLS Strategy by Platform + +| Platform | TLS Strategy | CLI Flag | +|----------|--------------|----------| +| **Minikube** | None (HTTP) or self-signed | `--ingress` (no TLS) | +| **Standard K8s** | cert-manager + Let's Encrypt | `--ingress --ingress-tls auto` | +| **Standard K8s** | Manual certificate | `--ingress --ingress-tls-secret name` | +| **AWS EKS** | ACM certificate | `--ingress` + ACM annotation | +| **GKE** | Google-managed cert | `--ingress` + ManagedCertificate | +| **Fly.io FKS** | Automatic (Fly-managed) | `deploy up fly` (TLS automatic) | + +## Database Strategy by Platform + +| Platform | Recommended PostgreSQL | Recommended Redis | +|----------|----------------------|-------------------| +| **Development** | Docker Compose (local) | Docker Compose (local) | +| **Minikube** | In-cluster StatefulSet | In-cluster Deployment | +| **Standard K8s** | In-cluster or managed (RDS, Cloud SQL) | In-cluster or managed | +| **Fly.io FKS** | Fly Postgres (managed) | Upstash Redis or in-cluster | + +## Migration Path + +### From Docker Compose to FKS + +1. **Test locally** with `deploy up dev` +2. **Test on Minikube** with `deploy up k8s` +3. **Deploy to FKS** with `deploy up fly` + +### From Standard K8s to FKS + +1. **Export data** from existing PostgreSQL +2. **Create Fly Postgres** cluster +3. **Import data** to Fly Postgres +4. **Deploy app** to FKS with `--postgres fly` +5. **Update DNS** to point to Fly + +## Current Limitations + +1. **FKS is relatively new** - Some features may change +2. **Fly Postgres clustering** - Different from standard PostgreSQL HA +3. **Temporal on Fly** - May need special consideration for workflows +4. **Cost model** - Fly charges differently than traditional cloud + +## Related Documentation + +- [Kubernetes Deployment Guide](./fastapi-kubernetes-deployment.md) - Standard K8s deployment +- [Ingress Configuration](./fastapi-kubernetes-ingress.md) - Ingress and TLS for standard K8s +- [Docker Dev Environment](./fastapi-docker-dev-environment.md) - Local development + +## External Resources + +- [Fly.io Kubernetes Documentation](https://fly.io/docs/kubernetes/) +- [Fly.io FKS Quickstart](https://fly.io/docs/kubernetes/fks-quickstart/) +- [Fly Postgres](https://fly.io/docs/postgres/) +- [Upstash Redis on Fly](https://fly.io/docs/reference/redis/)
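
## Appendix: Trying the Proposed FKS Values by Hand

The pieces above (the `app.fly.*` values and the `app-fly.yaml` Service template) are still proposals, but they could be exercised manually with nothing more than Helm and kubectl once an FKS cluster is available. The sketch below is illustrative only: it assumes the chart lives at `./infra/helm/api-forge`, that your kubectl context already points at the FKS cluster, and that `api-forge`, `api-forge-prod`, `my-fastapi-app`, and `api.example.com` are placeholder names to adjust for your setup.

```bash
# Rough dry run of the proposed FKS integration (sketch, not implemented yet).
# Assumptions: chart at ./infra/helm/api-forge, kubectl already targets an FKS
# cluster, and all names/domains below are placeholders.

# Render only the proposed Fly LoadBalancer Service to inspect its annotations
helm template api-forge ./infra/helm/api-forge \
  --set app.fly.enabled=true \
  --set app.fly.app=my-fastapi-app \
  --set "app.fly.domains={api.example.com}" \
  --show-only templates/services/app-fly.yaml

# Install with the standard Ingress disabled (Fly's Anycast proxy replaces it)
helm upgrade --install api-forge ./infra/helm/api-forge \
  --namespace api-forge-prod --create-namespace \
  --set global.namespace=api-forge-prod \
  --set app.ingress.enabled=false \
  --set app.fly.enabled=true \
  --set app.fly.app=my-fastapi-app

# Confirm the LoadBalancer Service exists and carries the fly.io/ annotations
kubectl get service app -n api-forge-prod
kubectl describe service app -n api-forge-prod | grep 'fly.io/'

# Custom domain: add the CNAME described earlier, then verify HTTPS end to end
#   api.example.com CNAME my-fastapi-app.fly.dev
curl -I https://api.example.com/
```

If the rendered Service and its `fly.io/app` annotation look right, the same flags map directly onto the future `deploy up fly` flow described under Proposed CLI Commands.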