From 8e09e33acbf0aded6ea0014c9f88f6723702e27d Mon Sep 17 00:00:00 2001 From: jc Date: Wed, 3 Dec 2025 20:18:30 -0500 Subject: [PATCH 1/5] improvements to error detection logic --- infra/helm/api-forge/values.yaml | 4 +- scripts/test_copier_generation.sh | 2 +- src/cli/deployment/helm_deployer/validator.py | 143 ++++++++++++++---- src/cli/deployment/shell_commands/kubectl.py | 18 ++- tests/unit/cli/deployment/test_validator.py | 102 ++++++++++++- 5 files changed, 226 insertions(+), 43 deletions(-) diff --git a/infra/helm/api-forge/values.yaml b/infra/helm/api-forge/values.yaml index 8ce2624..191c967 100644 --- a/infra/helm/api-forge/values.yaml +++ b/infra/helm/api-forge/values.yaml @@ -80,7 +80,7 @@ postgres: # Redis Cache/Sessions # ============================================================================= redis: - enabled: false + enabled: true image: repository: app_data_redis_image tag: latest @@ -122,7 +122,7 @@ redis: # Temporal Workflow Engine # ============================================================================= temporal: - enabled: false + enabled: true image: repository: my-temporal-server tag: latest diff --git a/scripts/test_copier_generation.sh b/scripts/test_copier_generation.sh index dc6321f..ee0ce46 100755 --- a/scripts/test_copier_generation.sh +++ b/scripts/test_copier_generation.sh @@ -67,7 +67,7 @@ log_info "Output directory: $TEST_OUTPUT_DIR" # Use --vcs-ref HEAD to use the current state (including uncommitted changes) # Use --data to override specific answers for testing optional features copier copy --trust --defaults --vcs-ref HEAD \ - --data use_temporal=true --data use_redis=true \ + --data use_temporal=false --data use_redis=false \ "$TEMPLATE_DIR" "$TEST_OUTPUT_DIR" if [ $? -eq 0 ]; then diff --git a/src/cli/deployment/helm_deployer/validator.py b/src/cli/deployment/helm_deployer/validator.py index e6eed31..d9040f8 100644 --- a/src/cli/deployment/helm_deployer/validator.py +++ b/src/cli/deployment/helm_deployer/validator.py @@ -182,11 +182,13 @@ def prompt_cleanup(self, result: ValidationResult, namespace: str) -> bool: "[yellow]Recommended: Run the following command to clean up:[/yellow]" ) self.console.print( - "[bold cyan] uv run api-forge-cli deploy down k8s --volumes[/bold cyan]\n" + "[bold cyan] uv run api-forge-cli deploy down k8s[/bold cyan]\n" ) self.console.print( - "[dim]This will delete all resources and persistent volumes, " - "allowing a fresh deployment.[/dim]\n" + "[dim]This will delete the Helm release and allow a fresh deployment.[/dim]" + ) + self.console.print( + "[dim]Add --volumes only if you need to wipe persistent data (databases, etc).[/dim]\n" ) # Prompt user @@ -203,11 +205,8 @@ def prompt_cleanup(self, result: ValidationResult, namespace: str) -> bool: self.console.print( "[bold yellow]Errors detected that may cause deployment issues.[/bold yellow]\n" ) - self.console.print("[yellow]Consider running cleanup first:[/yellow]") - self.console.print( - "[bold cyan] uv run api-forge-cli deploy down k8s --volumes[/bold cyan]\n" - ) + # Prompt user try: response = ( input("Proceed with deployment anyway? [y/N]: ").strip().lower() @@ -290,25 +289,46 @@ def _has_helm_release(self, namespace: str) -> bool: return any(r.name == self.constants.HELM_RELEASE_NAME for r in releases) def _check_failed_jobs(self, namespace: str, result: ValidationResult) -> None: - """Check for failed jobs in the namespace.""" + """Check for failed jobs in the namespace. + + Only flags jobs that have actually failed (exhausted retries with no + success). 
Jobs that are still running or have completed successfully + are not flagged, even if they had previous failed attempts. + + Failed jobs are flagged as warnings since Kubernetes will often retry + them, and they may succeed on subsequent attempts as dependencies + come online. + """ jobs = self.commands.kubectl.get_jobs(namespace) for job in jobs: - if job.get("status") == "Failed": + job_name = job["name"] + job_status = job.get("status") + + # If job succeeded, it's fine - ignore any previous failures + if job_status == "Complete": + continue + + # If job is still running, don't flag it + if job_status == "Running": + continue + + # Job failed - flag as warning (may be transient) + if job_status == "Failed": result.issues.append( ValidationIssue( - severity=ValidationSeverity.ERROR, - title=f"Failed job: {job['name']}", + severity=ValidationSeverity.WARNING, + title=f"Job has failures: {job_name}", description=( - f"Job '{job['name']}' failed. This may indicate " - "initialization or configuration problems." + f"Job '{job_name}' has failed attempts. This may be " + "transient during startup while dependencies initialize." ), recovery_hint=( - "Delete the failed job and redeploy, or run full cleanup " - "with 'deploy down k8s --volumes'" + f"Check logs: 'kubectl logs job/{job_name} -n {namespace}'. " + f"Delete job to retry: 'kubectl delete job {job_name} -n {namespace}'" ), resource_type="Job", - resource_name=job["name"], + resource_name=job_name, ) ) @@ -318,21 +338,22 @@ def _check_crashloop_pods(self, namespace: str, result: ValidationResult) -> Non for pod in pods: if pod.get("status") == "CrashLoopBackOff": + pod_name = str(pod["name"]) restarts = pod.get("restarts", 0) result.issues.append( ValidationIssue( severity=ValidationSeverity.ERROR, - title=f"Pod in CrashLoopBackOff: {pod['name']}", + title=f"Pod in CrashLoopBackOff: {pod_name}", description=( - f"Pod '{pod['name']}' is crash-looping ({restarts} restarts). " + f"Pod '{pod_name}' is crash-looping ({restarts} restarts). " "This usually indicates configuration or dependency issues." ), recovery_hint=( - "Check pod logs with 'kubectl logs {name} -n {namespace}', " + f"Check pod logs with 'kubectl logs {pod_name} -n {namespace}', " "then fix the issue or run cleanup" - ).format(name=pod["name"], namespace=namespace), + ), resource_type="Pod", - resource_name=pod["name"], + resource_name=pod_name, ) ) @@ -342,44 +363,102 @@ def _check_pending_pods(self, namespace: str, result: ValidationResult) -> None: for pod in pods: if pod.get("status") == "Pending": + pod_name = str(pod["name"]) # Check if it's been pending for a while (ignore recently created) # For now, treat all Pending as warnings result.issues.append( ValidationIssue( severity=ValidationSeverity.WARNING, - title=f"Pod pending: {pod['name']}", + title=f"Pod pending: {pod_name}", description=( - f"Pod '{pod['name']}' is stuck in Pending state. " + f"Pod '{pod_name}' is stuck in Pending state. " "This may indicate resource constraints or scheduling issues." ), recovery_hint=( - "Check events with 'kubectl describe pod {name} -n {namespace}'" - ).format(name=pod["name"], namespace=namespace), + f"Check events with 'kubectl describe pod {pod_name} -n {namespace}'" + ), resource_type="Pod", - resource_name=pod["name"], + resource_name=pod_name, ) ) def _check_error_pods(self, namespace: str, result: ValidationResult) -> None: - """Check for pods in Error state.""" + """Check for pods in Error state. + + For pods owned by Jobs, only considers the most recent pod per job. 
+ This avoids flagging old failed attempts when the job has since + succeeded or has a newer attempt in progress. + """ pods = self.commands.kubectl.get_pods(namespace) + # Group job-owned pods by their job name + job_pods: dict[str, list[dict[str, str | int]]] = {} + non_job_pods: list[dict[str, str | int]] = [] + for pod in pods: + job_owner = str(pod.get("jobOwner", "")) + if job_owner: + if job_owner not in job_pods: + job_pods[job_owner] = [] + job_pods[job_owner].append(pod) + else: + non_job_pods.append(pod) + + # Check non-job pods for errors (these are always relevant) + for pod in non_job_pods: if pod.get("status") == "Error": + pod_name = str(pod["name"]) result.issues.append( ValidationIssue( severity=ValidationSeverity.ERROR, - title=f"Pod in Error state: {pod['name']}", + title=f"Pod in Error state: {pod_name}", description=( - f"Pod '{pod['name']}' is in Error state. " + f"Pod '{pod_name}' is in Error state. " "Check logs to determine the cause." ), recovery_hint=( - "Check pod logs with 'kubectl logs {name} -n {namespace}', " + f"Check pod logs with 'kubectl logs {pod_name} -n {namespace}', " "then fix the issue or run cleanup" - ).format(name=pod["name"], namespace=namespace), + ), + resource_type="Pod", + resource_name=pod_name, + ) + ) + + # For job-owned pods, only check the most recent pod per job + for job_name, pods_list in job_pods.items(): + # Sort by creation timestamp (newest first) + # ISO 8601 timestamps sort correctly as strings + sorted_pods = sorted( + pods_list, + key=lambda p: str(p.get("creationTimestamp", "")), + reverse=True, + ) + + if not sorted_pods: + continue + + most_recent_pod = sorted_pods[0] + pod_status = most_recent_pod.get("status") + + # Only flag if the most recent pod is in Error state + # Completed/Succeeded pods are fine, older failed pods are irrelevant + if pod_status == "Error": + pod_name = str(most_recent_pod["name"]) + result.issues.append( + ValidationIssue( + severity=ValidationSeverity.WARNING, + title=f"Job pod in Error state: {pod_name}", + description=( + f"Most recent pod for job '{job_name}' is in Error state. " + "This may be transient if the job will retry." + ), + recovery_hint=( + f"Check logs: 'kubectl logs {pod_name} -n {namespace}'. " + f"Delete job to retry: 'kubectl delete job {job_name} -n {namespace}'" + ), resource_type="Pod", - resource_name=pod["name"], + resource_name=pod_name, ) ) diff --git a/src/cli/deployment/shell_commands/kubectl.py b/src/cli/deployment/shell_commands/kubectl.py index dbe21fd..575e3d1 100644 --- a/src/cli/deployment/shell_commands/kubectl.py +++ b/src/cli/deployment/shell_commands/kubectl.py @@ -486,14 +486,15 @@ def wait_for_pods( capture_output=False, ) - def get_pods(self, namespace: str) -> list[dict[str, str]]: + def get_pods(self, namespace: str) -> list[dict[str, str | int]]: """Get all pods in a namespace with their status. 
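+
+        Each item is a flat dict; an illustrative example (hypothetical pod
+        name and timestamp):
+
+            {"name": "api-abc12", "status": "Running", "restarts": 0,
+             "creationTimestamp": "2025-01-01T10:05:00Z", "jobOwner": ""}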
Args: namespace: Kubernetes namespace Returns: - List of dicts with pod name, status, and restarts + List of dicts with pod name, status, restarts, creation timestamp, + and job owner (if pod is owned by a Job) """ result = self._runner.run( ["kubectl", "get", "pods", "-n", namespace, "-o", "json"], @@ -507,9 +508,18 @@ def get_pods(self, namespace: str) -> list[dict[str, str]]: pods = [] for pod in data.get("items", []): - name = pod.get("metadata", {}).get("name", "") + metadata = pod.get("metadata", {}) + name = metadata.get("name", "") + creation_timestamp = metadata.get("creationTimestamp", "") status = pod.get("status", {}) + # Check if pod is owned by a Job + job_owner = "" + for owner_ref in metadata.get("ownerReferences", []): + if owner_ref.get("kind") == "Job": + job_owner = owner_ref.get("name", "") + break + # Determine pod status phase = status.get("phase", "Unknown") container_statuses = status.get("containerStatuses", []) @@ -535,6 +545,8 @@ def get_pods(self, namespace: str) -> list[dict[str, str]]: "name": name, "status": pod_status, "restarts": restarts, + "creationTimestamp": creation_timestamp, + "jobOwner": job_owner, } ) diff --git a/tests/unit/cli/deployment/test_validator.py b/tests/unit/cli/deployment/test_validator.py index b92608f..6dbf336 100644 --- a/tests/unit/cli/deployment/test_validator.py +++ b/tests/unit/cli/deployment/test_validator.py @@ -160,7 +160,11 @@ def test_validate_existing_namespace_no_issues( def test_validate_detects_failed_jobs( self, validator: DeploymentValidator, mock_commands: MagicMock ) -> None: - """Validation should detect failed Kubernetes jobs.""" + """Validation should detect failed Kubernetes jobs. + + Init jobs like postgres-verifier are expected to have transient failures + during startup, so they should be flagged as warnings, not errors. 
+ """ mock_commands.kubectl.namespace_exists.return_value = True mock_commands.helm.list_releases.return_value = [] mock_commands.kubectl.get_jobs.return_value = [ @@ -172,9 +176,30 @@ def test_validate_detects_failed_jobs( assert result.is_clean is False assert len(result.issues) == 1 - assert result.issues[0].severity == ValidationSeverity.ERROR + # Init jobs like postgres-verifier are expected to have transient failures + # during startup, so they should be WARNING not ERROR + assert result.issues[0].severity == ValidationSeverity.WARNING assert "postgres-verifier" in result.issues[0].title + def test_validate_any_failed_job_is_warning( + self, validator: DeploymentValidator, mock_commands: MagicMock + ) -> None: + """All failed jobs should be flagged as warnings (may be transient).""" + mock_commands.kubectl.namespace_exists.return_value = True + mock_commands.helm.list_releases.return_value = [] + mock_commands.kubectl.get_jobs.return_value = [ + {"name": "migration-job", "status": "Failed"}, + ] + mock_commands.kubectl.get_pods.return_value = [] + + result = validator.validate("api-forge-prod") + + assert result.is_clean is False + assert len(result.issues) == 1 + # All failed jobs are warnings since they may be transient + assert result.issues[0].severity == ValidationSeverity.WARNING + assert "migration-job" in result.issues[0].title + def test_validate_detects_crashloop_pods( self, validator: DeploymentValidator, mock_commands: MagicMock ) -> None: @@ -231,6 +256,72 @@ def test_validate_detects_error_pods( assert result.issues[0].severity == ValidationSeverity.ERROR assert "Error" in result.issues[0].title + def test_validate_job_pods_only_checks_most_recent( + self, validator: DeploymentValidator, mock_commands: MagicMock + ) -> None: + """For job-owned pods, only the most recent pod should be checked. + + If old pods from a job are in Error state but a newer pod succeeded, + we should not flag the old errors. 
+ """ + mock_commands.kubectl.namespace_exists.return_value = True + mock_commands.helm.list_releases.return_value = [] + mock_commands.kubectl.get_jobs.return_value = [] + mock_commands.kubectl.get_pods.return_value = [ + # Old pod from first attempt - failed + { + "name": "postgres-verifier-abc", + "status": "Error", + "jobOwner": "postgres-verifier", + "creationTimestamp": "2025-01-01T10:00:00Z", + }, + # Newer pod from second attempt - succeeded + { + "name": "postgres-verifier-def", + "status": "Succeeded", + "jobOwner": "postgres-verifier", + "creationTimestamp": "2025-01-01T10:05:00Z", + }, + ] + + result = validator.validate("api-forge-prod") + + # Should be clean - the most recent pod succeeded + assert result.is_clean is True + assert len(result.issues) == 0 + + def test_validate_job_pods_flags_if_most_recent_failed( + self, validator: DeploymentValidator, mock_commands: MagicMock + ) -> None: + """If the most recent job pod is in Error state, flag it as a warning.""" + mock_commands.kubectl.namespace_exists.return_value = True + mock_commands.helm.list_releases.return_value = [] + mock_commands.kubectl.get_jobs.return_value = [] + mock_commands.kubectl.get_pods.return_value = [ + # Old pod succeeded + { + "name": "postgres-verifier-abc", + "status": "Succeeded", + "jobOwner": "postgres-verifier", + "creationTimestamp": "2025-01-01T10:00:00Z", + }, + # Newer pod failed + { + "name": "postgres-verifier-def", + "status": "Error", + "jobOwner": "postgres-verifier", + "creationTimestamp": "2025-01-01T10:05:00Z", + }, + ] + + result = validator.validate("api-forge-prod") + + # Should flag the most recent failed pod as a warning + assert result.is_clean is False + assert len(result.issues) == 1 + assert result.issues[0].severity == ValidationSeverity.WARNING + assert "postgres-verifier-def" in result.issues[0].title + def test_validate_detects_multiple_issues( self, validator: DeploymentValidator, mock_commands: MagicMock ) -> None: @@ -249,10 +340,11 @@ def test_validate_detects_multiple_issues( assert result.is_clean is False assert len(result.issues) == 3 - # Should have 2 ERRORs (failed job + crashloop), 1 WARNING (pending) + # Should have 1 ERROR (crashloop), 2 WARNINGs (init job + pending) + # postgres-verifier is an init job so it's a WARNING, not ERROR severities = [issue.severity for issue in result.issues] - assert severities.count(ValidationSeverity.ERROR) == 2 - assert ValidationSeverity.WARNING in severities + assert severities.count(ValidationSeverity.ERROR) == 1 + assert severities.count(ValidationSeverity.WARNING) == 2 def test_display_results_clean( self, From 354bcca881d0c53ce480d2aebeb25e9fb7b90d10 Mon Sep 17 00:00:00 2001 From: jc Date: Wed, 3 Dec 2025 21:07:21 -0500 Subject: [PATCH 2/5] refactor(api): standardize health endpoint responses with Pydantic models - Add ServiceStatus enum (HEALTHY, UNHEALTHY, DEGRADED, DISABLED) for consistent terminology - Create HealthCheckService with dependency injection pattern - Refactor health router to use typed Pydantic response models - Add workflow schemas (WorkflowStartRequest/Response, etc.) 
- Improve workflows router with proper response_model annotations - Add comprehensive docstrings and OpenAPI documentation - Add unit tests for health endpoints (13 tests) and HealthCheckService (21 tests) - All 490 unit tests passing --- src/app/api/http/routers/health.py | 384 ++++++++---------- src/app/api/http/routers/workflows.py | 171 ++++++-- src/app/api/http/schemas/__init__.py | 61 +++ src/app/api/http/schemas/health.py | 237 +++++++++++ src/app/api/http/schemas/workflows.py | 128 ++++++ src/app/core/services/health_service.py | 350 ++++++++++++++++ .../api/http/routers/test_health_endpoints.py | 316 ++++++++++++++ .../app/core/services/test_health_service.py | 378 +++++++++++++++++ 8 files changed, 1783 insertions(+), 242 deletions(-) create mode 100644 src/app/api/http/schemas/health.py create mode 100644 src/app/api/http/schemas/workflows.py create mode 100644 src/app/core/services/health_service.py create mode 100644 tests/unit/app/api/http/routers/test_health_endpoints.py create mode 100644 tests/unit/app/core/services/test_health_service.py diff --git a/src/app/api/http/routers/health.py b/src/app/api/http/routers/health.py index f673986..2787ed9 100644 --- a/src/app/api/http/routers/health.py +++ b/src/app/api/http/routers/health.py @@ -1,259 +1,235 @@ -"""Health check endpoints router for monitoring service availability.""" +"""Health check endpoints router for monitoring service availability. -from typing import Any +This module provides HTTP endpoints for health checking the application +and its dependencies. These endpoints are used by: -from fastapi import APIRouter, Request +- Kubernetes liveness probes (/health) +- Kubernetes readiness probes (/health/ready) +- Monitoring systems +- Load balancers + +Endpoint Summary: + GET /health - Liveness probe (app is running) + GET /health/ready - Readiness probe (all services operational) + GET /health/database - Database-specific health check + GET /health/redis - Redis-specific health check + GET /health/temporal - Temporal-specific health check +""" + +from __future__ import annotations + +from fastapi import APIRouter, Depends, Request from starlette.responses import JSONResponse from src.app.api.http.app_data import ApplicationDependencies +from src.app.api.http.schemas.health import ( + DatabaseHealthDetailed, + HealthCheckError, + LivenessResponse, + OverallStatus, + ReadinessResponse, + RedisHealthDetailed, + ServiceStatus, + TemporalHealthDetailed, +) +from src.app.core.services.health_service import HealthCheckService from src.app.runtime.context import get_config router = APIRouter(prefix="/health", tags=["health"]) -@router.get("") -async def health() -> dict[str, str]: +# ============================================================================= +# Dependencies +# ============================================================================= + + +def get_health_service(request: Request) -> HealthCheckService: + """Get the health check service instance. + + This dependency creates a HealthCheckService with the application's + dependencies and configuration. 
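+
+    In tests this dependency can be replaced via FastAPI dependency overrides
+    (the health endpoint unit tests use this pattern); 'fake_health_service'
+    below is a hypothetical stand-in for a stubbed service:
+
+    ```python
+    app.dependency_overrides[get_health_service] = lambda: fake_health_service
+    ```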
+ """ + app_deps: ApplicationDependencies = request.app.state.app_dependencies + config = get_config() + return HealthCheckService(app_deps, config) + + +# ============================================================================= +# Liveness Probe +# ============================================================================= + + +@router.get( + "", + response_model=LivenessResponse, + summary="Liveness probe", + description="Basic health check - returns 200 if the application process is running.", +) +async def health() -> LivenessResponse: """Basic health check endpoint - checks if app is running. This is a liveness probe that returns 200 OK as long as the application process is running. It does not check dependencies. - """ - return {"status": "healthy", "service": "api"} - -@router.get("/ready", response_model=None) -async def readiness(request: Request) -> dict[str, Any] | JSONResponse: + Use this endpoint for Kubernetes liveness probes. + """ + return LivenessResponse() + + +# ============================================================================= +# Readiness Probe +# ============================================================================= + + +@router.get( + "/ready", + response_model=ReadinessResponse, + responses={ + 200: {"description": "All services are ready"}, + 503: { + "description": "One or more services are not ready", + "model": ReadinessResponse, + }, + }, + summary="Readiness probe", + description="Comprehensive readiness check - validates all service dependencies.", +) +async def readiness( + health_service: HealthCheckService = Depends(get_health_service), +) -> ReadinessResponse | JSONResponse: """Comprehensive readiness check - validates all service dependencies. - Returns 200 if all services are ready, 503 if any service is unavailable. + Returns 200 if all critical services are ready, 503 if any are unavailable. This checks: - - Database connectivity - - Redis (non-critical, falls back to in-memory) - - Temporal (if enabled) + - Database connectivity (critical) + - Redis connectivity (non-critical, falls back to in-memory) + - Temporal connectivity (critical if enabled) - OIDC providers (critical in production only) - """ - from pydantic import BaseModel - - # Simple model for health check test data - class HealthCheckData(BaseModel): - test: bool - app_deps: ApplicationDependencies = request.app.state.app_dependencies - config = get_config() - - checks: dict[str, Any] = {} - all_healthy = True - - # Database health check - try: - db_healthy = app_deps.database_service.health_check() - checks["database"] = { - "status": "healthy" if db_healthy else "unhealthy", - "type": "postgresql" if "postgresql" in config.database.url else "sqlite", - } - if not db_healthy: - all_healthy = False - except Exception as e: - checks["database"] = { - "status": "unhealthy", - "error": str(e), - } - all_healthy = False + Use this endpoint for Kubernetes readiness probes. 
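+
+    Illustrative 200 response body (example values; null fields omitted):
+
+        {
+            "status": "ready",
+            "environment": "development",
+            "checks": {
+                "database": {"status": "healthy", "type": "postgresql"},
+                "redis": {"status": "disabled", "type": "in-memory"},
+                "temporal": {"status": "disabled"}
+            }
+        }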
+ """ + result = await health_service.check_all() - # Redis health check - try: - if app_deps.redis_service: - redis_healthy = await app_deps.redis_service.health_check() - else: - redis_healthy = False # Redis service not configured - - checks["redis"] = { - "status": "healthy" - if redis_healthy - else "unhealthy" - if config.redis.enabled - else "degraded", - "type": "redis" if config.redis.enabled else "in-memory", - } - # Redis failure is not critical - we fall back to in-memory - # So we don't set all_healthy = False here - except Exception as e: - checks["redis"] = { - "status": "degraded", - "type": "in-memory", - "note": "Using in-memory storage", - "error": str(e) if config.redis.enabled else "Redis not configured", - } - # Redis failure is not critical - we fall back to in-memory - - # Temporal health check - if config.temporal.enabled: - try: - temporal_healthy = await app_deps.temporal_service.health_check() - checks["temporal"] = { - "status": "healthy" if temporal_healthy else "unhealthy", - "url": app_deps.temporal_service.url, - "namespace": app_deps.temporal_service.namespace, - } - if not temporal_healthy: - all_healthy = False - except Exception as e: - checks["temporal"] = { - "status": "unhealthy", - "error": str(e), - } - all_healthy = False - else: - checks["temporal"] = { - "status": "disabled", - "note": "Temporal service is not enabled", - } - - # OIDC providers check (verify JWKS endpoints are reachable) - oidc_checks = {} - if config.oidc.providers: - jwks_service = app_deps.jwks_service - for provider_name, provider_config in config.oidc.providers.items(): - try: - # Try to fetch JWKS to verify provider is reachable - _ = await jwks_service.fetch_jwks(provider_config) - oidc_checks[provider_name] = { - "status": "healthy", - "issuer": provider_config.issuer, - } - except Exception as e: - oidc_checks[provider_name] = { - "status": "unhealthy", - "issuer": provider_config.issuer, - "error": str(e), - } - # OIDC failures are not critical for non-production - if config.app.environment == "production": - all_healthy = False - - if oidc_checks: - checks["oidc_providers"] = oidc_checks - - # Build response - response = { - "status": "ready" if all_healthy else "not_ready", - "environment": config.app.environment, - "checks": checks, - } - - # Return 503 if not all services are healthy - if not all_healthy: + if result.status == OverallStatus.NOT_READY: return JSONResponse( status_code=503, - content=response, + content=result.model_dump(mode="json"), ) - return response + return result + + +# ============================================================================= +# Individual Service Health Checks +# ============================================================================= + +@router.get( + "/database", + response_model=DatabaseHealthDetailed, + responses={ + 503: { + "description": "Database is unhealthy", + "model": HealthCheckError, + }, + }, + summary="Database health check", + description="Database-specific health check with connection pool status.", +) +async def health_database(request: Request) -> DatabaseHealthDetailed | JSONResponse: + """Database-specific health check with connection pool status. -@router.get("/database", response_model=None) -async def health_database(request: Request) -> dict[str, Any] | JSONResponse: - """Database-specific health check with connection pool status.""" + Returns detailed information about the database connection including + pool statistics when available. 
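+
+    Illustrative 200 response body (example values; the exact pool keys
+    depend on the database service implementation):
+
+        {"status": "healthy", "type": "postgresql",
+         "pool": {"size": 5, "in_use": 1}}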
+ """ app_deps: ApplicationDependencies = request.app.state.app_dependencies config = get_config() + db_type = "postgresql" if "postgresql" in config.database.url else "sqlite" + try: healthy = app_deps.database_service.health_check() pool_status = app_deps.database_service.get_pool_status() - return { - "status": "healthy" if healthy else "unhealthy", - "type": "postgresql" if "postgresql" in config.database.url else "sqlite", - "pool": pool_status, - } + # Convert pool_status to compatible type + pool: dict[str, int | str] | None = dict(pool_status) if pool_status else None + + return DatabaseHealthDetailed( + status=ServiceStatus.HEALTHY if healthy else ServiceStatus.UNHEALTHY, + type=db_type, + pool=pool, + ) except Exception as e: return JSONResponse( status_code=503, - content={ - "status": "unhealthy", - "error": str(e), - "error_type": type(e).__name__, - }, + content=HealthCheckError( + status=ServiceStatus.UNHEALTHY, + error=str(e), + error_type=type(e).__name__, + ).model_dump(mode="json"), ) -@router.get("/redis", response_model=None) -async def health_redis(request: Request) -> dict[str, Any] | JSONResponse: +@router.get( + "/redis", + response_model=RedisHealthDetailed, + responses={ + 503: { + "description": "Redis is unhealthy", + "model": HealthCheckError, + }, + }, + summary="Redis health check", + description="Redis-specific health check with server information.", +) +async def health_redis( + health_service: HealthCheckService = Depends(get_health_service), +) -> RedisHealthDetailed | JSONResponse: """Redis-specific health check using actual Redis operations. This performs a real test operation (set/get/delete) to verify Redis is functioning correctly, not just a simple PING. - """ - app_deps: ApplicationDependencies = request.app.state.app_dependencies - config = get_config() - if not config.redis.enabled: - return { - "status": "disabled", - "type": "in-memory", - "note": "Redis is not enabled, using in-memory storage", - } + Returns detailed Redis server information when available. + """ + result = await health_service.check_redis_detailed() - try: - # Use the health_check method from RedisService - if app_deps.redis_service: - healthy = await app_deps.redis_service.health_check() - # Optionally get more detailed info - info = await app_deps.redis_service.get_info() - else: - healthy = False # Redis service not configured - info = None - - result: dict[str, Any] = { - "status": "healthy" if healthy else "unhealthy", - "type": "redis", - "url": config.redis.url, - } - - if info: - result["info"] = info - - return result - except Exception as e: + if result.status == ServiceStatus.UNHEALTHY: return JSONResponse( status_code=503, - content={ - "status": "unhealthy", - "type": "redis", - "error": str(e), - "error_type": type(e).__name__, - "fallback": "in-memory storage", - }, + content=result.model_dump(mode="json"), ) + return result + + +@router.get( + "/temporal", + response_model=TemporalHealthDetailed, + responses={ + 503: { + "description": "Temporal is unhealthy", + "model": HealthCheckError, + }, + }, + summary="Temporal health check", + description="Temporal workflow service health check.", +) +async def health_temporal( + health_service: HealthCheckService = Depends(get_health_service), +) -> TemporalHealthDetailed | JSONResponse: + """Temporal-specific health check. + + Returns connection status and configuration details for the + Temporal workflow service. 
+ """ + result = await health_service.check_temporal_detailed() -@router.get("/temporal", response_model=None) -async def health_temporal(request: Request) -> dict[str, Any] | JSONResponse: - """Temporal-specific health check.""" - app_deps: ApplicationDependencies = request.app.state.app_dependencies - - if not app_deps.temporal_service.is_enabled: - return { - "status": "disabled", - "note": "Temporal service is not enabled", - } - - try: - healthy = await app_deps.temporal_service.health_check() - - return { - "status": "healthy" if healthy else "unhealthy", - "url": app_deps.temporal_service.url, - "namespace": app_deps.temporal_service.namespace, - "task_queue": app_deps.temporal_service.task_queue, - } - except Exception as e: + if result.status == ServiceStatus.UNHEALTHY: return JSONResponse( status_code=503, - content={ - "status": "unhealthy", - "error": str(e), - "error_type": type(e).__name__, - }, + content=result.model_dump(mode="json"), ) + + return result diff --git a/src/app/api/http/routers/workflows.py b/src/app/api/http/routers/workflows.py index 84bc635..78dd23c 100644 --- a/src/app/api/http/routers/workflows.py +++ b/src/app/api/http/routers/workflows.py @@ -1,64 +1,159 @@ -# api/routes/workflows.py +"""Temporal workflow management endpoints. + +This module provides HTTP endpoints for managing Temporal workflows, +including starting workflows, sending signals, and querying workflow state. + +Endpoint Summary: + POST /workflows/start - Start a new workflow + POST /workflows/{id}/signal/{name} - Send signal to a running workflow + GET /workflows/{id} - Query workflow state +""" + +from __future__ import annotations + import uuid -from typing import Any from fastapi import APIRouter, Depends, HTTPException -from pydantic import BaseModel from src.app.api.http.deps import get_temporal_service +from src.app.api.http.schemas.workflows import ( + WorkflowNotFoundError, + WorkflowQueryResponse, + WorkflowSignalRequest, + WorkflowSignalResponse, + WorkflowStartRequest, + WorkflowStartResponse, +) from src.app.core.services import TemporalClientService -router = APIRouter(prefix="/workflows") +router = APIRouter(prefix="/workflows", tags=["workflows"]) -class StartRequest(BaseModel): - workflow: str # e.g., "OrderWorkflow" - args: list[Any] = [] - kwargs: dict[str, Any] = {} - id: str | None = None - task_queue: str = "app" +@router.post( + "/start", + response_model=WorkflowStartResponse, + responses={ + 404: { + "description": "Workflow type not found", + "model": WorkflowNotFoundError, + }, + }, + summary="Start a new workflow", + description="Start a new Temporal workflow execution with the specified parameters.", +) +async def start_workflow( + request: WorkflowStartRequest, + client_service: TemporalClientService = Depends(get_temporal_service), +) -> WorkflowStartResponse: + """Start a new Temporal workflow. + Looks up the workflow class by name from the worker.workflows module + and starts a new execution with the provided arguments. 
+ + Args: + request: Workflow start parameters including workflow name and arguments + + Returns: + WorkflowStartResponse with the workflow ID and run ID + + Raises: + HTTPException: 404 if the workflow type is not found + """ + # Dynamic import of workflow class + try: + workflow_module = __import__("worker.workflows", fromlist=[request.workflow]) + workflow_class = getattr(workflow_module, request.workflow, None) + except ImportError: + workflow_class = None + + if not workflow_class: + raise HTTPException( + status_code=404, + detail=f"Workflow type '{request.workflow}' not found", + ) -@router.post("/start") -async def start( - req: StartRequest, - client_service: TemporalClientService = Depends(get_temporal_service), -) -> dict[str, str | None]: - wf = getattr( - __import__("worker.workflows", fromlist=[req.workflow]), req.workflow, None - ) client = await client_service.get_client() - if not wf: - raise HTTPException(404, "Workflow type not found") + + # Generate workflow ID if not provided + workflow_id = request.id or f"{request.workflow.lower()}-{uuid.uuid4()}" handle = await client.start_workflow( - wf.run, - *req.args, - **req.kwargs, - id=req.id or f"{req.workflow.lower()}-{uuid.uuid4()}", - task_queue=req.task_queue, + workflow_class.run, + *request.args, + **request.kwargs, + id=workflow_id, + task_queue=request.task_queue, + ) + + return WorkflowStartResponse( + workflow_id=handle.id, + run_id=handle.first_execution_run_id, ) - return {"workflow_id": handle.id, "run_id": handle.first_execution_run_id} -@router.post("/{workflow_id}/signal/{signal_name}") -async def signal( +@router.post( + "/{workflow_id}/signal/{signal_name}", + response_model=WorkflowSignalResponse, + summary="Signal a running workflow", + description="Send a signal to a running workflow to trigger state changes or actions.", +) +async def signal_workflow( workflow_id: str, signal_name: str, - payload: dict[str, Any], + request: WorkflowSignalRequest, client_service: TemporalClientService = Depends(get_temporal_service), -) -> dict[str, bool]: +) -> WorkflowSignalResponse: + """Send a signal to a running workflow. + + Signals are async messages that workflows can handle to trigger + state changes, continue waiting operations, or perform actions. + + Args: + workflow_id: ID of the running workflow + signal_name: Name of the signal to send + request: Signal payload + + Returns: + WorkflowSignalResponse indicating success + """ client = await client_service.get_client() - h = client.get_workflow_handle(workflow_id) - await h.signal(signal_name, **payload) - return {"ok": True} + handle = client.get_workflow_handle(workflow_id) + + await handle.signal(signal_name, **request.payload) + + return WorkflowSignalResponse( + success=True, + message=f"Signal '{signal_name}' sent to workflow '{workflow_id}'", + ) -@router.get("/{workflow_id}") -async def read( +@router.get( + "/{workflow_id}", + response_model=WorkflowQueryResponse, + summary="Query workflow state", + description="Query the current state of a workflow using the 'state' query handler.", +) +async def query_workflow( workflow_id: str, client_service: TemporalClientService = Depends(get_temporal_service), -) -> Any: +) -> WorkflowQueryResponse: + """Query a workflow's current state. + + Uses the 'state' query handler defined in the workflow to retrieve + the current workflow state. 
+ + Args: + workflow_id: ID of the workflow to query + + Returns: + WorkflowQueryResponse with the workflow state + """ client = await client_service.get_client() - h = client.get_workflow_handle(workflow_id) - return await h.query("state") + handle = client.get_workflow_handle(workflow_id) + + state = await handle.query("state") + + return WorkflowQueryResponse( + state=state, + workflow_id=workflow_id, + ) diff --git a/src/app/api/http/schemas/__init__.py b/src/app/api/http/schemas/__init__.py index e69de29..465162a 100644 --- a/src/app/api/http/schemas/__init__.py +++ b/src/app/api/http/schemas/__init__.py @@ -0,0 +1,61 @@ +"""API schema definitions for HTTP endpoints. + +This package contains Pydantic models for request/response schemas +used by the HTTP API layer. + +Modules: + health: Health check response models + workflows: Temporal workflow request/response models +""" + +from src.app.api.http.schemas.health import ( + AllServicesHealth, + DatabaseHealth, + DatabaseHealthDetailed, + HealthCheckError, + LivenessResponse, + OIDCProviderHealth, + OverallStatus, + ReadinessResponse, + RedisHealth, + RedisHealthDetailed, + ServiceHealthBase, + ServiceStatus, + TemporalHealth, + TemporalHealthDetailed, +) +from src.app.api.http.schemas.workflows import ( + WorkflowExecutionError, + WorkflowNotFoundError, + WorkflowQueryResponse, + WorkflowSignalRequest, + WorkflowSignalResponse, + WorkflowStartRequest, + WorkflowStartResponse, +) + +__all__ = [ + # Health schemas + "ServiceStatus", + "OverallStatus", + "ServiceHealthBase", + "DatabaseHealth", + "DatabaseHealthDetailed", + "RedisHealth", + "RedisHealthDetailed", + "TemporalHealth", + "TemporalHealthDetailed", + "OIDCProviderHealth", + "AllServicesHealth", + "ReadinessResponse", + "LivenessResponse", + "HealthCheckError", + # Workflow schemas + "WorkflowStartRequest", + "WorkflowStartResponse", + "WorkflowSignalRequest", + "WorkflowSignalResponse", + "WorkflowQueryResponse", + "WorkflowNotFoundError", + "WorkflowExecutionError", +] diff --git a/src/app/api/http/schemas/health.py b/src/app/api/http/schemas/health.py new file mode 100644 index 0000000..6c793ff --- /dev/null +++ b/src/app/api/http/schemas/health.py @@ -0,0 +1,237 @@ +"""Health check response schemas. + +This module defines the Pydantic models for health check API responses, +providing a consistent and well-documented interface for health endpoints. + +Status Terminology: + - healthy: Service is fully operational + - unhealthy: Service is not operational (critical failure) + - degraded: Service is partially operational or using fallback + - disabled: Service is intentionally not enabled in configuration +""" + +from __future__ import annotations + +from enum import Enum +from typing import Annotated + +from pydantic import BaseModel, ConfigDict, Field + + +class ServiceStatus(str, Enum): + """Standard status values for service health checks. + + These statuses provide a consistent vocabulary across all health + check responses: + + - HEALTHY: Service is fully operational and responding normally + - UNHEALTHY: Service is not operational (connection failed, errors, etc.) + - DEGRADED: Service is partially operational or using a fallback mechanism + - DISABLED: Service is intentionally not enabled in the configuration + """ + + HEALTHY = "healthy" + UNHEALTHY = "unhealthy" + DEGRADED = "degraded" + DISABLED = "disabled" + + +class OverallStatus(str, Enum): + """Overall application readiness status. 
+ + - READY: All critical services are operational + - NOT_READY: One or more critical services are not operational + """ + + READY = "ready" + NOT_READY = "not_ready" + + +# ============================================================================= +# Base Models +# ============================================================================= + + +class ServiceHealthBase(BaseModel): + """Base model for individual service health check results.""" + + model_config = ConfigDict(use_enum_values=True) + + status: ServiceStatus = Field(description="Current health status of the service") + note: str | None = Field( + default=None, + description="Optional additional context about the status", + ) + error: str | None = Field( + default=None, + description="Error message if status is unhealthy", + ) + + +# ============================================================================= +# Database Health Models +# ============================================================================= + + +class DatabaseHealth(ServiceHealthBase): + """Health check result for the database service.""" + + type: Annotated[ + str, + Field(description="Database type (postgresql, sqlite)"), + ] + + +class DatabaseHealthDetailed(DatabaseHealth): + """Detailed health check result for the database with pool info.""" + + pool: dict[str, int | str] | None = Field( + default=None, + description="Connection pool statistics", + ) + + +# ============================================================================= +# Redis Health Models +# ============================================================================= + + +class RedisHealth(ServiceHealthBase): + """Health check result for the Redis service.""" + + type: Annotated[ + str, + Field(description="Storage type (redis, in-memory)"), + ] + + +class RedisHealthDetailed(RedisHealth): + """Detailed health check result for Redis with server info.""" + + url: str | None = Field( + default=None, + description="Redis connection URL (masked)", + ) + info: dict[str, str | int] | None = Field( + default=None, + description="Redis server information", + ) + fallback: str | None = Field( + default=None, + description="Fallback mechanism being used if Redis is unavailable", + ) + + +# ============================================================================= +# Temporal Health Models +# ============================================================================= + + +class TemporalHealth(ServiceHealthBase): + """Health check result for the Temporal workflow service.""" + + url: str | None = Field( + default=None, + description="Temporal server URL", + ) + namespace: str | None = Field( + default=None, + description="Temporal namespace", + ) + + +class TemporalHealthDetailed(TemporalHealth): + """Detailed health check result for Temporal.""" + + task_queue: str | None = Field( + default=None, + description="Temporal task queue name", + ) + + +# ============================================================================= +# OIDC Provider Health Models +# ============================================================================= + + +class OIDCProviderHealth(ServiceHealthBase): + """Health check result for an individual OIDC provider.""" + + issuer: str = Field(description="OIDC issuer URL") + + +class OIDCProvidersHealth(BaseModel): + """Health check results for all configured OIDC providers.""" + + model_config = ConfigDict(extra="allow") + + # Dynamic fields for each provider (e.g., google, microsoft, keycloak) + # Using extra="allow" to handle dynamic provider names + + 
+# ============================================================================= +# Aggregate Health Check Models +# ============================================================================= + + +class AllServicesHealth(BaseModel): + """Aggregated health check results for all services.""" + + model_config = ConfigDict(use_enum_values=True) + + database: DatabaseHealth + redis: RedisHealth + temporal: TemporalHealth + oidc_providers: dict[str, OIDCProviderHealth] | None = Field( + default=None, + description="Health status of configured OIDC providers", + ) + + +class ReadinessResponse(BaseModel): + """Response model for the /health/ready endpoint. + + This is the primary health check endpoint used by Kubernetes + readiness probes to determine if the application is ready + to receive traffic. + """ + + model_config = ConfigDict(use_enum_values=True) + + status: OverallStatus = Field(description="Overall application readiness status") + environment: str = Field( + description="Current deployment environment (development, production, etc.)" + ) + checks: AllServicesHealth = Field( + description="Individual service health check results" + ) + + +class LivenessResponse(BaseModel): + """Response model for the /health endpoint (liveness probe). + + This is a simple health check that only verifies the application + process is running. It does not check dependencies. + """ + + status: Annotated[ + str, + Field(description="Always 'healthy' if the app is running"), + ] = "healthy" + service: Annotated[ + str, + Field(description="Service identifier"), + ] = "api" + + +# ============================================================================= +# Error Response Models +# ============================================================================= + + +class HealthCheckError(BaseModel): + """Error response for health check endpoints.""" + + status: ServiceStatus = ServiceStatus.UNHEALTHY + error: str = Field(description="Error message") + error_type: str = Field(description="Exception class name") diff --git a/src/app/api/http/schemas/workflows.py b/src/app/api/http/schemas/workflows.py new file mode 100644 index 0000000..f77adb4 --- /dev/null +++ b/src/app/api/http/schemas/workflows.py @@ -0,0 +1,128 @@ +"""Pydantic schemas for workflow API endpoints. + +This module defines request and response models for the Temporal workflow +management API endpoints. +""" + +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, Field + +# ============================================================================= +# Request Models +# ============================================================================= + + +class WorkflowStartRequest(BaseModel): + """Request model for starting a new workflow. + + Example: + ```json + { + "workflow": "OrderWorkflow", + "args": [{"order_id": "123"}], + "task_queue": "orders" + } + ``` + """ + + workflow: str = Field( + description="Name of the workflow class to execute (e.g., 'OrderWorkflow')" + ) + args: list[Any] = Field( + default_factory=list, + description="Positional arguments to pass to the workflow", + ) + kwargs: dict[str, Any] = Field( + default_factory=dict, + description="Keyword arguments to pass to the workflow", + ) + id: str | None = Field( + default=None, + description="Custom workflow ID. 
If not provided, one is auto-generated.", + ) + task_queue: str = Field( + default="app", + description="Temporal task queue to use for the workflow", + ) + + +class WorkflowSignalRequest(BaseModel): + """Request model for signaling a workflow. + + Signals are async messages that can be sent to running workflows + to trigger state changes or actions. + """ + + payload: dict[str, Any] = Field( + default_factory=dict, + description="Data payload to send with the signal", + ) + + +# ============================================================================= +# Response Models +# ============================================================================= + + +class WorkflowStartResponse(BaseModel): + """Response model for a successfully started workflow.""" + + workflow_id: str = Field(description="Unique identifier for the workflow") + run_id: str | None = Field( + default=None, + description="Temporal run ID for this workflow execution", + ) + + +class WorkflowSignalResponse(BaseModel): + """Response model for a successful signal operation.""" + + success: bool = Field( + default=True, + description="Whether the signal was sent successfully", + ) + message: str = Field( + default="Signal sent", + description="Status message", + ) + + +class WorkflowQueryResponse(BaseModel): + """Response model for a workflow query. + + The actual structure depends on the workflow's query handler, + but this provides a typed wrapper. + """ + + state: Any = Field(description="Current workflow state from the query") + workflow_id: str | None = Field( + default=None, + description="Workflow ID that was queried", + ) + + +class WorkflowNotFoundError(BaseModel): + """Error response when a workflow type is not found.""" + + detail: str = Field(description="Error message describing what went wrong") + available_workflows: list[str] | None = Field( + default=None, + description="List of available workflow types", + ) + + +class WorkflowExecutionError(BaseModel): + """Error response for workflow execution failures.""" + + detail: str = Field(description="Error message") + workflow_id: str | None = Field( + default=None, + description="Workflow ID if available", + ) + error_type: str | None = Field( + default=None, + description="Type of error that occurred", + ) diff --git a/src/app/core/services/health_service.py b/src/app/core/services/health_service.py new file mode 100644 index 0000000..3630562 --- /dev/null +++ b/src/app/core/services/health_service.py @@ -0,0 +1,350 @@ +"""Health check service for application dependencies. + +This module provides a centralized service for performing health checks +on all application dependencies (database, Redis, Temporal, OIDC providers). + +The service abstracts health check logic from the HTTP layer, making it +testable and reusable across different contexts (HTTP endpoints, CLI tools, +background workers, etc.). +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +from src.app.api.http.schemas.health import ( + AllServicesHealth, + DatabaseHealth, + OIDCProviderHealth, + OverallStatus, + ReadinessResponse, + RedisHealth, + RedisHealthDetailed, + ServiceStatus, + TemporalHealth, + TemporalHealthDetailed, +) + +if TYPE_CHECKING: + from src.app.api.http.app_data import ApplicationDependencies + from src.app.runtime.config.config_data import ConfigData + + +@dataclass +class HealthCheckResult: + """Result of an individual health check. 
+ + Attributes: + status: The health status of the service + details: Additional details about the check (type, url, etc.) + is_critical: Whether this service failure should mark the app as not ready + """ + + status: ServiceStatus + details: dict[str, str | int | bool | None] + is_critical: bool = True + + +class HealthCheckService: + """Service for performing health checks on application dependencies. + + This service encapsulates all health check logic and provides a clean + interface for checking individual services or all services at once. + + Example: + ```python + health_service = HealthCheckService(app_deps, config) + result = await health_service.check_all() + if result.status == OverallStatus.READY: + print("All systems go!") + ``` + """ + + def __init__( + self, + app_deps: ApplicationDependencies, + config: ConfigData, + ) -> None: + """Initialize the health check service. + + Args: + app_deps: Application dependencies container + config: Application configuration + """ + self._app_deps = app_deps + self._config = config + + # ========================================================================= + # Public API + # ========================================================================= + + async def check_all(self) -> ReadinessResponse: + """Perform health checks on all services. + + Returns: + ReadinessResponse with status of all services + """ + database = await self.check_database() + redis = await self.check_redis() + temporal = await self.check_temporal() + oidc_providers = await self.check_oidc_providers() + + # Determine overall status based on critical services + all_healthy = self._evaluate_overall_health( + database=database, + redis=redis, + temporal=temporal, + oidc_providers=oidc_providers, + ) + + return ReadinessResponse( + status=OverallStatus.READY if all_healthy else OverallStatus.NOT_READY, + environment=self._config.app.environment, + checks=AllServicesHealth( + database=database, + redis=redis, + temporal=temporal, + oidc_providers=oidc_providers if oidc_providers else None, + ), + ) + + async def check_database(self) -> DatabaseHealth: + """Check database connectivity. + + Returns: + DatabaseHealth with status and database type + """ + db_type = self._get_database_type() + + try: + healthy = self._app_deps.database_service.health_check() + return DatabaseHealth( + status=ServiceStatus.HEALTHY if healthy else ServiceStatus.UNHEALTHY, + type=db_type, + ) + except Exception as e: + return DatabaseHealth( + status=ServiceStatus.UNHEALTHY, + type=db_type, + error=str(e), + ) + + async def check_redis(self) -> RedisHealth: + """Check Redis connectivity. + + Redis is a non-critical service - if unavailable, the application + falls back to in-memory storage. 
+ + Returns: + RedisHealth with status and storage type + """ + if not self._config.redis.enabled: + return RedisHealth( + status=ServiceStatus.DISABLED, + type="in-memory", + note="Redis is not enabled, using in-memory storage", + ) + + try: + if self._app_deps.redis_service: + healthy = await self._app_deps.redis_service.health_check() + return RedisHealth( + status=ServiceStatus.HEALTHY if healthy else ServiceStatus.DEGRADED, + type="redis" if healthy else "in-memory", + note=None if healthy else "Falling back to in-memory storage", + ) + else: + return RedisHealth( + status=ServiceStatus.DEGRADED, + type="in-memory", + note="Redis service not initialized, using in-memory storage", + ) + except Exception as e: + return RedisHealth( + status=ServiceStatus.DEGRADED, + type="in-memory", + note="Using in-memory storage due to Redis error", + error=str(e), + ) + + async def check_redis_detailed(self) -> RedisHealthDetailed: + """Check Redis with detailed server information. + + Returns: + RedisHealthDetailed with server info if available + """ + if not self._config.redis.enabled: + return RedisHealthDetailed( + status=ServiceStatus.DISABLED, + type="in-memory", + note="Redis is not enabled, using in-memory storage", + ) + + try: + if self._app_deps.redis_service: + healthy = await self._app_deps.redis_service.health_check() + info = ( + await self._app_deps.redis_service.get_info() if healthy else None + ) + + return RedisHealthDetailed( + status=ServiceStatus.HEALTHY + if healthy + else ServiceStatus.UNHEALTHY, + type="redis", + url=self._config.redis.url, + info=info, + ) + else: + return RedisHealthDetailed( + status=ServiceStatus.DEGRADED, + type="in-memory", + note="Redis service not initialized", + fallback="in-memory storage", + ) + except Exception as e: + return RedisHealthDetailed( + status=ServiceStatus.UNHEALTHY, + type="redis", + error=str(e), + fallback="in-memory storage", + ) + + async def check_temporal(self) -> TemporalHealth: + """Check Temporal workflow service connectivity. + + Returns: + TemporalHealth with status and connection details + """ + if not self._config.temporal.enabled: + return TemporalHealth( + status=ServiceStatus.DISABLED, + note="Temporal service is not enabled", + ) + + try: + healthy = await self._app_deps.temporal_service.health_check() + return TemporalHealth( + status=ServiceStatus.HEALTHY if healthy else ServiceStatus.UNHEALTHY, + url=self._app_deps.temporal_service.url, + namespace=self._app_deps.temporal_service.namespace, + ) + except Exception as e: + return TemporalHealth( + status=ServiceStatus.UNHEALTHY, + error=str(e), + ) + + async def check_temporal_detailed(self) -> TemporalHealthDetailed: + """Check Temporal with detailed configuration info. + + Returns: + TemporalHealthDetailed with task queue info + """ + if not self._config.temporal.enabled: + return TemporalHealthDetailed( + status=ServiceStatus.DISABLED, + note="Temporal service is not enabled", + ) + + try: + healthy = await self._app_deps.temporal_service.health_check() + return TemporalHealthDetailed( + status=ServiceStatus.HEALTHY if healthy else ServiceStatus.UNHEALTHY, + url=self._app_deps.temporal_service.url, + namespace=self._app_deps.temporal_service.namespace, + task_queue=self._app_deps.temporal_service.task_queue, + ) + except Exception as e: + return TemporalHealthDetailed( + status=ServiceStatus.UNHEALTHY, + error=str(e), + ) + + async def check_oidc_providers(self) -> dict[str, OIDCProviderHealth] | None: + """Check all configured OIDC providers. 
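+
+        Illustrative return value ('google' is a hypothetical provider name):
+
+            {"google": OIDCProviderHealth(status=ServiceStatus.HEALTHY,
+                                          issuer="https://accounts.google.com")}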
+ + Returns: + Dict of provider name to health status, or None if no providers configured + """ + if not self._config.oidc.providers: + return None + + results: dict[str, OIDCProviderHealth] = {} + jwks_service = self._app_deps.jwks_service + + for provider_name, provider_config in self._config.oidc.providers.items(): + try: + # Verify provider is reachable by fetching JWKS + await jwks_service.fetch_jwks(provider_config) + results[provider_name] = OIDCProviderHealth( + status=ServiceStatus.HEALTHY, + issuer=provider_config.issuer, + ) + except Exception as e: + results[provider_name] = OIDCProviderHealth( + status=ServiceStatus.UNHEALTHY, + issuer=provider_config.issuer, + error=str(e), + ) + + return results if results else None + + # ========================================================================= + # Private Helpers + # ========================================================================= + + def _get_database_type(self) -> str: + """Determine database type from connection URL.""" + if "postgresql" in self._config.database.url: + return "postgresql" + elif "sqlite" in self._config.database.url: + return "sqlite" + else: + return "unknown" + + def _evaluate_overall_health( + self, + database: DatabaseHealth, + redis: RedisHealth, + temporal: TemporalHealth, + oidc_providers: dict[str, OIDCProviderHealth] | None, + ) -> bool: + """Evaluate overall health based on individual service checks. + + Critical services: + - Database: Always critical + - Temporal: Critical only if enabled + - OIDC: Critical only in production + + Non-critical services: + - Redis: Falls back to in-memory, never critical + + Args: + database: Database health check result + redis: Redis health check result + temporal: Temporal health check result + oidc_providers: OIDC provider health check results + + Returns: + True if all critical services are healthy + """ + # Database is always critical + if database.status == ServiceStatus.UNHEALTHY: + return False + + # Temporal is critical only if enabled + if temporal.status == ServiceStatus.UNHEALTHY: + return False + + # OIDC is critical only in production + if oidc_providers and self._config.app.environment == "production": + for provider in oidc_providers.values(): + if provider.status == ServiceStatus.UNHEALTHY: + return False + + # Redis is never critical (falls back to in-memory) + # So we don't check redis.status here + + return True diff --git a/tests/unit/app/api/http/routers/test_health_endpoints.py b/tests/unit/app/api/http/routers/test_health_endpoints.py new file mode 100644 index 0000000..086eeef --- /dev/null +++ b/tests/unit/app/api/http/routers/test_health_endpoints.py @@ -0,0 +1,316 @@ +"""Unit tests for health check API endpoints. + +These tests verify the health router behavior with mocked dependencies. 
+""" + +from __future__ import annotations + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from src.app.api.http.routers.health import get_health_service, router +from src.app.api.http.schemas.health import ( + AllServicesHealth, + DatabaseHealth, + LivenessResponse, + OverallStatus, + ReadinessResponse, + RedisHealth, + RedisHealthDetailed, + ServiceStatus, + TemporalHealth, + TemporalHealthDetailed, +) +from src.app.core.services.health_service import HealthCheckService + + +@pytest.fixture +def mock_config() -> MagicMock: + """Create a mock configuration.""" + config = MagicMock() + config.app.environment = "development" + config.database.url = "sqlite:///test.db" + config.redis.enabled = True + config.redis.url = "redis://localhost:6379" + config.temporal.enabled = False + config.oidc.providers = {} + return config + + +@pytest.fixture +def mock_app_deps() -> MagicMock: + """Create mock application dependencies.""" + deps = MagicMock() + deps.database_service.health_check.return_value = True + deps.database_service.get_pool_status.return_value = {"size": 5, "in_use": 1} + deps.redis_service = AsyncMock() + deps.redis_service.health_check = AsyncMock(return_value=True) + deps.redis_service.get_info = AsyncMock( + return_value={"version": "7.0.0", "connected_clients": 5} + ) + deps.temporal_service.is_enabled = False + deps.temporal_service.health_check = AsyncMock(return_value=True) + deps.temporal_service.url = "localhost:7233" + deps.temporal_service.namespace = "default" + deps.temporal_service.task_queue = "app" + deps.jwks_service = AsyncMock() + return deps + + +@pytest.fixture +def mock_health_service(mock_app_deps: MagicMock, mock_config: MagicMock) -> MagicMock: + """Create a mock health check service.""" + service = MagicMock(spec=HealthCheckService) + + # Configure check_all to return a successful response + service.check_all = AsyncMock( + return_value=ReadinessResponse( + status=OverallStatus.READY, + environment="development", + checks=AllServicesHealth( + database=DatabaseHealth(status=ServiceStatus.HEALTHY, type="sqlite"), + redis=RedisHealth(status=ServiceStatus.DISABLED, type="in-memory"), + temporal=TemporalHealth(status=ServiceStatus.DISABLED), + ), + ) + ) + + service.check_redis_detailed = AsyncMock( + return_value=RedisHealthDetailed( + status=ServiceStatus.HEALTHY, + type="redis", + url="redis://localhost:6379", + info={"version": "7.0.0"}, + ) + ) + + service.check_temporal_detailed = AsyncMock( + return_value=TemporalHealthDetailed( + status=ServiceStatus.DISABLED, + note="Temporal service is not enabled", + ) + ) + + return service + + +@pytest.fixture +def test_app(mock_health_service: MagicMock) -> FastAPI: + """Create test FastAPI application.""" + app = FastAPI() + app.include_router(router) + + # Override the health service dependency + app.dependency_overrides[get_health_service] = lambda: mock_health_service + + return app + + +@pytest.fixture +def client(test_app: FastAPI) -> TestClient: + """Create test client.""" + return TestClient(test_app) + + +class TestLivenessEndpoint: + """Tests for the /health liveness probe endpoint.""" + + def test_health_returns_healthy(self, client: TestClient) -> None: + """Test basic health check returns healthy status.""" + response = client.get("/health") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "healthy" + assert data["service"] == "api" + + def 
test_health_response_model(self, client: TestClient) -> None: + """Test health response matches LivenessResponse model.""" + response = client.get("/health") + data = response.json() + + # Validate against model + liveness = LivenessResponse(**data) + assert liveness.status == "healthy" + assert liveness.service == "api" + + +class TestReadinessEndpoint: + """Tests for the /health/ready readiness probe endpoint.""" + + def test_readiness_returns_ready( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test readiness check returns ready when all services healthy.""" + response = client.get("/health/ready") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "ready" + assert data["environment"] == "development" + assert "checks" in data + + def test_readiness_returns_503_when_not_ready( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test readiness check returns 503 when critical services fail.""" + # Configure service to return NOT_READY + mock_health_service.check_all = AsyncMock( + return_value=ReadinessResponse( + status=OverallStatus.NOT_READY, + environment="development", + checks=AllServicesHealth( + database=DatabaseHealth( + status=ServiceStatus.UNHEALTHY, + type="postgresql", + error="Connection refused", + ), + redis=RedisHealth(status=ServiceStatus.HEALTHY, type="redis"), + temporal=TemporalHealth(status=ServiceStatus.DISABLED), + ), + ) + ) + + response = client.get("/health/ready") + + assert response.status_code == 503 + data = response.json() + assert data["status"] == "not_ready" + assert data["checks"]["database"]["status"] == "unhealthy" + + def test_readiness_includes_all_service_checks( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test readiness response includes all expected service checks.""" + response = client.get("/health/ready") + data = response.json() + + checks = data["checks"] + assert "database" in checks + assert "redis" in checks + assert "temporal" in checks + + +class TestRedisHealthEndpoint: + """Tests for the /health/redis endpoint.""" + + def test_redis_healthy( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test Redis health check returns healthy status.""" + response = client.get("/health/redis") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "healthy" + assert data["type"] == "redis" + + def test_redis_unhealthy_returns_503( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test Redis health check returns 503 when unhealthy.""" + mock_health_service.check_redis_detailed = AsyncMock( + return_value=RedisHealthDetailed( + status=ServiceStatus.UNHEALTHY, + type="redis", + error="Connection refused", + ) + ) + + response = client.get("/health/redis") + + assert response.status_code == 503 + data = response.json() + assert data["status"] == "unhealthy" + + def test_redis_disabled( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test Redis health check shows disabled status.""" + mock_health_service.check_redis_detailed = AsyncMock( + return_value=RedisHealthDetailed( + status=ServiceStatus.DISABLED, + type="in-memory", + note="Redis is not enabled, using in-memory storage", + ) + ) + + response = client.get("/health/redis") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "disabled" + assert data["type"] == "in-memory" + + +class TestTemporalHealthEndpoint: + 
"""Tests for the /health/temporal endpoint.""" + + def test_temporal_disabled( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test Temporal health check shows disabled status.""" + response = client.get("/health/temporal") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "disabled" + + def test_temporal_healthy( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test Temporal health check returns healthy when enabled.""" + mock_health_service.check_temporal_detailed = AsyncMock( + return_value=TemporalHealthDetailed( + status=ServiceStatus.HEALTHY, + url="localhost:7233", + namespace="default", + task_queue="app", + ) + ) + + response = client.get("/health/temporal") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "healthy" + assert data["url"] == "localhost:7233" + + def test_temporal_unhealthy_returns_503( + self, client: TestClient, mock_health_service: MagicMock + ) -> None: + """Test Temporal health check returns 503 when unhealthy.""" + mock_health_service.check_temporal_detailed = AsyncMock( + return_value=TemporalHealthDetailed( + status=ServiceStatus.UNHEALTHY, + error="Connection timeout", + ) + ) + + response = client.get("/health/temporal") + + assert response.status_code == 503 + data = response.json() + assert data["status"] == "unhealthy" + + +class TestServiceStatusEnum: + """Tests for the ServiceStatus enum values.""" + + def test_enum_values(self) -> None: + """Test ServiceStatus enum has expected values.""" + assert ServiceStatus.HEALTHY.value == "healthy" + assert ServiceStatus.UNHEALTHY.value == "unhealthy" + assert ServiceStatus.DEGRADED.value == "degraded" + assert ServiceStatus.DISABLED.value == "disabled" + + def test_enum_serialization(self) -> None: + """Test ServiceStatus serializes correctly in Pydantic models.""" + health = DatabaseHealth(status=ServiceStatus.HEALTHY, type="sqlite") + data = health.model_dump() + + # With use_enum_values=True, should serialize as string + assert data["status"] == "healthy" diff --git a/tests/unit/app/core/services/test_health_service.py b/tests/unit/app/core/services/test_health_service.py new file mode 100644 index 0000000..c3b6a47 --- /dev/null +++ b/tests/unit/app/core/services/test_health_service.py @@ -0,0 +1,378 @@ +"""Unit tests for the HealthCheckService. + +These tests verify the health check service logic with mocked dependencies. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from src.app.api.http.schemas.health import ( + OverallStatus, + ServiceStatus, +) +from src.app.core.services.health_service import HealthCheckService + +if TYPE_CHECKING: + pass + + +@pytest.fixture +def mock_config() -> MagicMock: + """Create a mock configuration.""" + config = MagicMock() + config.app.environment = "development" + config.database.url = "postgresql://localhost:5432/test" + config.redis.enabled = True + config.redis.url = "redis://localhost:6379" + config.temporal.enabled = True + config.oidc.providers = {} + return config + + +@pytest.fixture +def mock_app_deps() -> MagicMock: + """Create mock application dependencies.""" + deps = MagicMock() + + # Database service mocks + deps.database_service.health_check.return_value = True + deps.database_service.get_pool_status.return_value = {"size": 5, "in_use": 1} + + # Redis service mocks + deps.redis_service = AsyncMock() + deps.redis_service.health_check = AsyncMock(return_value=True) + deps.redis_service.get_info = AsyncMock( + return_value={"version": "7.0.0", "connected_clients": 5} + ) + + # Temporal service mocks + deps.temporal_service = MagicMock() + deps.temporal_service.health_check = AsyncMock(return_value=True) + deps.temporal_service.url = "localhost:7233" + deps.temporal_service.namespace = "default" + deps.temporal_service.task_queue = "app" + + # JWKS service for OIDC checks + deps.jwks_service = AsyncMock() + + return deps + + +@pytest.fixture +def health_service( + mock_app_deps: MagicMock, mock_config: MagicMock +) -> HealthCheckService: + """Create a HealthCheckService instance with mocked dependencies.""" + return HealthCheckService(mock_app_deps, mock_config) + + +class TestCheckAll: + """Tests for the check_all method.""" + + async def test_all_services_healthy( + self, health_service: HealthCheckService + ) -> None: + """Test check_all returns READY when all services are healthy.""" + result = await health_service.check_all() + + assert result.status == OverallStatus.READY + assert result.checks.database.status == ServiceStatus.HEALTHY + assert result.checks.redis.status == ServiceStatus.HEALTHY + assert result.checks.temporal.status == ServiceStatus.HEALTHY + + async def test_database_failure_makes_not_ready( + self, + mock_app_deps: MagicMock, + mock_config: MagicMock, + ) -> None: + """Test check_all returns NOT_READY when database fails.""" + mock_app_deps.database_service.health_check.return_value = False + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_all() + + assert result.status == OverallStatus.NOT_READY + assert result.checks.database.status == ServiceStatus.UNHEALTHY + + async def test_redis_failure_still_ready( + self, + mock_app_deps: MagicMock, + mock_config: MagicMock, + ) -> None: + """Test check_all returns READY even when Redis fails (non-critical).""" + mock_app_deps.redis_service.health_check = AsyncMock(return_value=False) + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_all() + + # Redis is non-critical, so should still be READY + assert result.status == OverallStatus.READY + assert result.checks.redis.status == ServiceStatus.DEGRADED + + async def test_temporal_failure_makes_not_ready( + self, + mock_app_deps: MagicMock, + mock_config: MagicMock, + ) -> None: + """Test check_all returns NOT_READY when Temporal fails (if enabled).""" + 
mock_app_deps.temporal_service.health_check = AsyncMock(return_value=False) + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_all() + + assert result.status == OverallStatus.NOT_READY + assert result.checks.temporal.status == ServiceStatus.UNHEALTHY + + +class TestCheckDatabase: + """Tests for the check_database method.""" + + async def test_healthy_postgresql(self, health_service: HealthCheckService) -> None: + """Test database check returns healthy for PostgreSQL.""" + result = await health_service.check_database() + + assert result.status == ServiceStatus.HEALTHY + assert result.type == "postgresql" + assert result.error is None + + async def test_healthy_sqlite( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test database check returns healthy for SQLite.""" + mock_config.database.url = "sqlite:///test.db" + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_database() + + assert result.status == ServiceStatus.HEALTHY + assert result.type == "sqlite" + + async def test_database_exception( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test database check returns unhealthy on exception.""" + mock_app_deps.database_service.health_check.side_effect = Exception( + "Connection refused" + ) + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_database() + + assert result.status == ServiceStatus.UNHEALTHY + assert result.error == "Connection refused" + + +class TestCheckRedis: + """Tests for the check_redis method.""" + + async def test_redis_healthy(self, health_service: HealthCheckService) -> None: + """Test Redis check returns healthy when working.""" + result = await health_service.check_redis() + + assert result.status == ServiceStatus.HEALTHY + assert result.type == "redis" + + async def test_redis_disabled( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test Redis check returns disabled when not enabled.""" + mock_config.redis.enabled = False + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_redis() + + assert result.status == ServiceStatus.DISABLED + assert result.type == "in-memory" + assert "not enabled" in (result.note or "") + + async def test_redis_service_not_initialized( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test Redis check returns degraded when service not initialized.""" + mock_app_deps.redis_service = None + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_redis() + + assert result.status == ServiceStatus.DEGRADED + assert result.type == "in-memory" + + async def test_redis_exception( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test Redis check returns degraded on exception.""" + mock_app_deps.redis_service.health_check = AsyncMock( + side_effect=Exception("Connection timeout") + ) + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_redis() + + assert result.status == ServiceStatus.DEGRADED + assert "Connection timeout" in (result.error or "") + + +class TestCheckTemporal: + """Tests for the check_temporal method.""" + + async def test_temporal_healthy(self, health_service: HealthCheckService) -> None: + """Test Temporal check returns healthy when working.""" + result = await health_service.check_temporal() + + assert result.status == ServiceStatus.HEALTHY + assert result.url == 
"localhost:7233" + assert result.namespace == "default" + + async def test_temporal_disabled( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test Temporal check returns disabled when not enabled.""" + mock_config.temporal.enabled = False + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_temporal() + + assert result.status == ServiceStatus.DISABLED + assert "not enabled" in (result.note or "") + + async def test_temporal_exception( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test Temporal check returns unhealthy on exception.""" + mock_app_deps.temporal_service.health_check = AsyncMock( + side_effect=Exception("Service unavailable") + ) + service = HealthCheckService(mock_app_deps, mock_config) + + result = await service.check_temporal() + + assert result.status == ServiceStatus.UNHEALTHY + assert "Service unavailable" in (result.error or "") + + +class TestCheckTemporalDetailed: + """Tests for the check_temporal_detailed method.""" + + async def test_includes_task_queue( + self, health_service: HealthCheckService + ) -> None: + """Test detailed Temporal check includes task queue.""" + result = await health_service.check_temporal_detailed() + + assert result.status == ServiceStatus.HEALTHY + assert result.task_queue == "app" + + +class TestCheckRedisDetailed: + """Tests for the check_redis_detailed method.""" + + async def test_includes_server_info( + self, health_service: HealthCheckService + ) -> None: + """Test detailed Redis check includes server info.""" + result = await health_service.check_redis_detailed() + + assert result.status == ServiceStatus.HEALTHY + assert result.info is not None + assert result.info.get("version") == "7.0.0" + + +class TestCheckOIDCProviders: + """Tests for the check_oidc_providers method.""" + + async def test_no_providers_configured( + self, health_service: HealthCheckService + ) -> None: + """Test OIDC check returns None when no providers configured.""" + result = await health_service.check_oidc_providers() + + assert result is None + + async def test_provider_healthy( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test OIDC check returns healthy for working provider.""" + # Add a mock provider + mock_provider = MagicMock() + mock_provider.issuer = "https://accounts.google.com" + mock_config.oidc.providers = {"google": mock_provider} + + mock_app_deps.jwks_service.fetch_jwks = AsyncMock(return_value={}) + + service = HealthCheckService(mock_app_deps, mock_config) + result = await service.check_oidc_providers() + + assert result is not None + assert "google" in result + assert result["google"].status == ServiceStatus.HEALTHY + assert result["google"].issuer == "https://accounts.google.com" + + async def test_provider_unhealthy( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test OIDC check returns unhealthy for failing provider.""" + # Add a mock provider + mock_provider = MagicMock() + mock_provider.issuer = "https://accounts.google.com" + mock_config.oidc.providers = {"google": mock_provider} + + mock_app_deps.jwks_service.fetch_jwks = AsyncMock( + side_effect=Exception("JWKS fetch failed") + ) + + service = HealthCheckService(mock_app_deps, mock_config) + result = await service.check_oidc_providers() + + assert result is not None + assert "google" in result + assert result["google"].status == ServiceStatus.UNHEALTHY + assert "JWKS fetch failed" in (result["google"].error or "") + + +class 
TestOverallHealthEvaluation: + """Tests for the _evaluate_overall_health method.""" + + async def test_oidc_failure_critical_in_production( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test OIDC failure makes app NOT_READY in production.""" + mock_config.app.environment = "production" + + # Add a failing OIDC provider + mock_provider = MagicMock() + mock_provider.issuer = "https://accounts.google.com" + mock_config.oidc.providers = {"google": mock_provider} + + mock_app_deps.jwks_service.fetch_jwks = AsyncMock( + side_effect=Exception("JWKS fetch failed") + ) + + service = HealthCheckService(mock_app_deps, mock_config) + result = await service.check_all() + + assert result.status == OverallStatus.NOT_READY + + async def test_oidc_failure_not_critical_in_development( + self, mock_app_deps: MagicMock, mock_config: MagicMock + ) -> None: + """Test OIDC failure doesn't make app NOT_READY in development.""" + mock_config.app.environment = "development" + + # Add a failing OIDC provider + mock_provider = MagicMock() + mock_provider.issuer = "https://accounts.google.com" + mock_config.oidc.providers = {"google": mock_provider} + + mock_app_deps.jwks_service.fetch_jwks = AsyncMock( + side_effect=Exception("JWKS fetch failed") + ) + + service = HealthCheckService(mock_app_deps, mock_config) + result = await service.check_all() + + # In development, OIDC failures are not critical + assert result.status == OverallStatus.READY From 7a6cee12706a76cc13e3705da457c1aeb20658aa Mon Sep 17 00:00:00 2001 From: jc Date: Wed, 3 Dec 2025 22:06:59 -0500 Subject: [PATCH 3/5] checkpoint --- docs/fastapi-kubernetes-deployment.md | 36 +- docs/fastapi-kubernetes-ingress.md | 587 ++++++++++++++++++ infra/helm/api-forge/templates/ingress.yaml | 52 ++ infra/helm/api-forge/values.yaml | 84 ++- src/cli/deploy_commands.py | 18 + src/cli/deployment/helm_deployer/deployer.py | 12 +- .../deployment/helm_deployer/helm_release.py | 35 +- 7 files changed, 790 insertions(+), 34 deletions(-) create mode 100644 docs/fastapi-kubernetes-ingress.md create mode 100644 infra/helm/api-forge/templates/ingress.yaml diff --git a/docs/fastapi-kubernetes-deployment.md b/docs/fastapi-kubernetes-deployment.md index 8d49f46..108770d 100644 --- a/docs/fastapi-kubernetes-deployment.md +++ b/docs/fastapi-kubernetes-deployment.md @@ -863,35 +863,18 @@ spec: ### Ingress -Expose your FastAPI application via Ingress: +API Forge includes built-in Ingress support via CLI flags. Enable external access with: -```yaml -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: app-ingress - annotations: - cert-manager.io/cluster-issuer: letsencrypt-prod - nginx.ingress.kubernetes.io/ssl-redirect: "true" -spec: - ingressClassName: nginx - tls: - - hosts: - - api.example.com - secretName: app-tls - rules: - - host: api.example.com - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: app - port: - number: 8000 +```bash +# Basic ingress (HTTP) +uv run api-forge-cli deploy up k8s --ingress + +# Custom hostname with TLS +uv run api-forge-cli deploy up k8s --ingress --ingress-host api.example.com --ingress-tls-secret api-tls ``` +For comprehensive Ingress documentation including TLS setup, cloud provider configurations, and troubleshooting, see the **[Ingress Configuration Guide](./fastapi-kubernetes-ingress.md)**. 
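+
+As a quick sanity check after enabling `--ingress` (a minimal sketch assuming the default `api.local` host, the `api-forge-prod` namespace, and an `/etc/hosts` entry pointing at your cluster):
+
+```bash
+# Confirm the Ingress resource exists and has been assigned an address
+kubectl get ingress -n api-forge-prod
+
+# Hit the app through the Ingress controller (relies on the /etc/hosts entry)
+curl http://api.local/health
+```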
+ ### NetworkPolicies Restrict pod-to-pod communication: @@ -1350,6 +1333,7 @@ kubectl apply -f argocd-application.yaml ## Related Documentation +- [Ingress Configuration](./fastapi-kubernetes-ingress.md) - External access, TLS, and routing - [Docker Dev Environment](./fastapi-docker-dev-environment.md) - Local testing before deployment - [Docker Compose Production](./fastapi-production-deployment-docker-compose.md) - Alternative deployment - [Testing Strategy](./fastapi-testing-strategy.md) - Test before deploying diff --git a/docs/fastapi-kubernetes-ingress.md b/docs/fastapi-kubernetes-ingress.md new file mode 100644 index 0000000..6caa455 --- /dev/null +++ b/docs/fastapi-kubernetes-ingress.md @@ -0,0 +1,587 @@ +# FastAPI Kubernetes Ingress Configuration + +Learn how to expose your FastAPI application externally using Kubernetes Ingress with API Forge. This guide covers Ingress setup, CLI usage, TLS configuration, and cloud provider integrations. + +## Overview + +API Forge includes built-in Ingress support for exposing your FastAPI application to external traffic. The Ingress resource provides: + +- **Host-based routing** - Route traffic to your app based on hostname +- **TLS/HTTPS termination** - Secure connections with SSL certificates +- **Path-based routing** - Route different paths to different services +- **Load balancing** - Distribute traffic across pod replicas +- **Cloud provider integration** - Works with AWS ALB, GCP GCLB, Azure, and NGINX + +> **Note:** Ingress is for the **app service only**. Internal services (PostgreSQL, Redis, Temporal) remain as ClusterIP services and are not exposed externally. + +## Quick Start + +Enable Ingress with the CLI: + +```bash +# Enable ingress with default host (api.local) +uv run api-forge-cli deploy up k8s --ingress + +# Custom hostname +uv run api-forge-cli deploy up k8s --ingress --ingress-host api.example.com + +# With TLS (reference to a K8s TLS secret) +uv run api-forge-cli deploy up k8s --ingress --ingress-host api.example.com --ingress-tls-secret api-tls +``` + +## Prerequisites + +Before enabling Ingress, ensure you have: + +### 1. Ingress Controller Installed + +An Ingress Controller must be installed in your cluster. The Ingress resource is just configuration - the controller does the actual routing. + +**Minikube (local development):** +```bash +minikube addons enable ingress + +# Verify the controller is running +kubectl get pods -n ingress-nginx +``` + +**Production clusters (GKE, EKS, AKS):** +```bash +# Install NGINX Ingress Controller via Helm +helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx +helm repo update + +helm install ingress-nginx ingress-nginx/ingress-nginx \ + --namespace ingress-nginx \ + --create-namespace + +# Wait for the controller to be ready +kubectl wait --namespace ingress-nginx \ + --for=condition=ready pod \ + --selector=app.kubernetes.io/component=controller \ + --timeout=120s + +# Verify installation +kubectl get svc -n ingress-nginx +``` + +### 2. 
DNS Configuration (Production) + +For production deployments, configure DNS to point to your cluster: + +- **Minikube:** Add to `/etc/hosts`: `$(minikube ip) api.local` +- **Cloud providers:** Create a DNS A/CNAME record pointing to the Load Balancer IP + +## CLI Reference + +The `deploy up k8s` command supports three Ingress-related flags: + +| Flag | Default | Description | +|------|---------|-------------| +| `--ingress` | `false` | Enable Ingress for external access | +| `--ingress-host` | `api.local` | Hostname for the Ingress rule | +| `--ingress-tls-secret` | None | Kubernetes TLS secret name for HTTPS | + +### Examples + +**Basic Ingress (HTTP only):** +```bash +uv run api-forge-cli deploy up k8s --ingress +# Access at: http://api.local (requires /etc/hosts entry) +``` + +**Custom hostname:** +```bash +uv run api-forge-cli deploy up k8s --ingress --ingress-host myapi.example.com +# Access at: http://myapi.example.com +``` + +**HTTPS with existing TLS secret:** +```bash +uv run api-forge-cli deploy up k8s \ + --ingress \ + --ingress-host api.example.com \ + --ingress-tls-secret api-tls +# Access at: https://api.example.com +``` + +## How Ingress Routing Works + +Understanding the request flow helps troubleshoot issues: + +``` +Internet Request (https://api.example.com/docs) + │ + ▼ +DNS Lookup + │ api.example.com → 34.123.45.67 (Load Balancer IP) + ▼ +Cloud Load Balancer + │ Provisioned by Ingress Controller + ▼ +Ingress Controller Pods (nginx) + │ 1. Reads HTTP Host header from request + │ 2. Matches against Ingress rules + │ 3. Finds: host=api.example.com → service=app:8000 + ▼ +Kubernetes Service (app) + │ ClusterIP, port 8000 + ▼ +FastAPI Pod(s) + │ Your application + ▼ +Response flows back through the chain +``` + +**Key Points:** + +1. The browser automatically sets the `Host` header from the URL +2. NGINX Ingress matches this header against configured Ingress resources +3. Traffic is proxied to the backend Kubernetes Service +4. 
The Service load-balances across pod replicas + +## Configuration + +### Values.yaml Structure + +The Ingress configuration lives in `infra/helm/api-forge/values.yaml`: + +```yaml +app: + ingress: + enabled: false # Set to true or use --ingress flag + className: nginx # Ingress controller class + annotations: {} # Provider-specific annotations + hosts: + - host: api-forge.local + paths: + - path: / + pathType: Prefix + tls: [] # TLS configuration +``` + +When using CLI flags, they override these values: + +```bash +# CLI command: +uv run api-forge-cli deploy up k8s --ingress --ingress-host api.example.com --ingress-tls-secret api-tls + +# Equivalent values.yaml: +app: + ingress: + enabled: true + hosts: + - host: api.example.com + paths: + - path: / + pathType: Prefix + tls: + - secretName: api-tls + hosts: + - api.example.com +``` + +### Direct values.yaml Configuration + +For more complex setups, edit `values.yaml` directly: + +```yaml +app: + ingress: + enabled: true + className: nginx + annotations: + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/proxy-body-size: "10m" + hosts: + - host: api.example.com + paths: + - path: / + pathType: Prefix + - path: /api/v2 + pathType: Prefix + tls: + - secretName: api-tls + hosts: + - api.example.com +``` + +## TLS/HTTPS Configuration + +### Option 1: Manual TLS Secret + +Create a TLS secret from existing certificates: + +```bash +# Create TLS secret from certificate files +kubectl create secret tls api-tls \ + --cert=path/to/tls.crt \ + --key=path/to/tls.key \ + -n api-forge-prod + +# Deploy with TLS +uv run api-forge-cli deploy up k8s \ + --ingress \ + --ingress-host api.example.com \ + --ingress-tls-secret api-tls +``` + +### Option 2: Cert-Manager (Recommended for Production) + +Use [cert-manager](https://cert-manager.io/) for automatic Let's Encrypt certificates: + +**1. Install cert-manager:** +```bash +helm repo add jetstack https://charts.jetstack.io +helm repo update + +helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --set installCRDs=true +``` + +**2. Create a ClusterIssuer:** +```yaml +# cluster-issuer.yaml +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-prod +spec: + acme: + email: your-email@example.com + server: https://acme-v02.api.letsencrypt.org/directory + privateKeySecretRef: + name: letsencrypt-prod-account-key + solvers: + - http01: + ingress: + class: nginx +``` + +```bash +kubectl apply -f cluster-issuer.yaml +``` + +**3. Configure Ingress in values.yaml:** +```yaml +app: + ingress: + enabled: true + className: nginx + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod # Triggers cert-manager + hosts: + - host: api.example.com + paths: + - path: / + pathType: Prefix + tls: + - secretName: api-tls # cert-manager creates this automatically + hosts: + - api.example.com +``` + +**How it works:** +1. Deploy the Ingress with the annotation and non-existent TLS secret +2. Cert-manager detects the `cert-manager.io/cluster-issuer` annotation +3. Cert-manager contacts Let's Encrypt, proves domain ownership via HTTP-01 challenge +4. Cert-manager creates the `api-tls` secret with the certificate +5. NGINX Ingress loads the certificate and serves HTTPS +6. 
Cert-manager auto-renews before expiry + +## Cloud Provider Examples + +### AWS ALB Ingress + +Use AWS Application Load Balancer: + +```yaml +app: + ingress: + enabled: true + className: alb + annotations: + alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/target-type: ip + alb.ingress.kubernetes.io/certificate-arn: arn:aws:acm:us-east-1:123456789:certificate/abc-123 + alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]' + alb.ingress.kubernetes.io/ssl-redirect: '443' + hosts: + - host: api.example.com + paths: + - path: / + pathType: Prefix + tls: + - hosts: + - api.example.com +``` + +**Prerequisites:** +- AWS Load Balancer Controller installed +- ACM certificate created for your domain + +### GKE with Google Cloud Load Balancer + +```yaml +app: + ingress: + enabled: true + className: gce + annotations: + kubernetes.io/ingress.global-static-ip-name: api-ip + networking.gke.io/managed-certificates: api-cert + hosts: + - host: api.example.com + paths: + - path: / + pathType: Prefix +``` + +**Prerequisites:** +- Reserve a global static IP: `gcloud compute addresses create api-ip --global` +- Create a ManagedCertificate resource + +### Azure Application Gateway + +```yaml +app: + ingress: + enabled: true + className: azure/application-gateway + annotations: + appgw.ingress.kubernetes.io/ssl-redirect: "true" + appgw.ingress.kubernetes.io/use-private-ip: "false" + hosts: + - host: api.example.com + paths: + - path: / + pathType: Prefix + tls: + - secretName: api-tls + hosts: + - api.example.com +``` + +## Minikube Local Development + +### Setup + +```bash +# Enable ingress addon +minikube addons enable ingress + +# Get Minikube IP +minikube ip +# Example output: 192.168.49.2 + +# Add to /etc/hosts +echo "$(minikube ip) api.local" | sudo tee -a /etc/hosts +``` + +### Deploy and Access + +```bash +# Deploy with Ingress +uv run api-forge-cli deploy up k8s --ingress + +# Access your API +curl http://api.local/health +open http://api.local/docs +``` + +### Minikube Tunnel (Alternative) + +If you have DNS issues, use `minikube tunnel`: + +```bash +# In a separate terminal (requires sudo) +minikube tunnel + +# This exposes LoadBalancer services on localhost +# Access at http://localhost:8000 (if using LoadBalancer type) +``` + +## Why Use a Real Domain for Production + +For production deployments, use your actual domain (not `api.local`) because: + +| Feature | Why Real Domain Required | +|---------|-------------------------| +| **TLS/HTTPS** | Certificates are issued for specific domains. Mismatch causes TLS handshake failure. | +| **OAuth/OIDC** | Redirect URIs registered with providers (Google, Microsoft) must match exactly. | +| **Cookies** | Set with `domain=example.com`. Won't work if Host header differs. | +| **CORS** | Configured for specific origins. Mismatched Host breaks cross-origin requests. | +| **Browser Security** | Browsers set Host header automatically from URL - cannot be overridden. | + +**For local development:** `api.local` works because you control the hosts file and don't need real TLS/OAuth. 
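+
+To see this Host-header matching in practice, here is a minimal check against a Minikube cluster (assuming the `api.local` `/etc/hosts` entry from the setup above; the exact 404 body depends on your controller's default backend):
+
+```bash
+# api.local resolves via /etc/hosts, so the Host header matches the Ingress rule
+curl http://api.local/health
+
+# Same Ingress IP, but the Host header is the raw IP and matches no rule,
+# so NGINX typically answers with its default-backend 404
+curl http://$(minikube ip)/health
+
+# Overriding the Host header makes the same request match the rule again
+curl -H "Host: api.local" http://$(minikube ip)/health
+```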
+ +## Service Types Comparison + +Understanding when to use Ingress vs other service types: + +| Type | Use Case | External Access | Example Services | +|------|----------|----------------|------------------| +| **ClusterIP** | Internal services only | No | PostgreSQL, Redis, Temporal | +| **NodePort** | Dev/testing, direct node access | Yes (nodeIP:port) | Debugging only | +| **LoadBalancer** | Single service exposure | Yes (external IP) | Simple apps without Ingress | +| **Ingress** | Production APIs, TLS, routing | Yes (hostname) | FastAPI app | + +**Recommendation:** Use Ingress for the app service. Keep PostgreSQL, Redis, and Temporal as ClusterIP (internal only) for security. + +## Troubleshooting + +### Ingress Not Accessible + +**1. Check Ingress Controller is running:** +```bash +kubectl get pods -n ingress-nginx +# Should show controller pods in Running state +``` + +**2. Check Ingress resource exists:** +```bash +kubectl get ingress -n api-forge-prod +kubectl describe ingress app -n api-forge-prod +``` + +**3. Check Ingress has an address:** +```bash +kubectl get ingress -n api-forge-prod -o wide +# ADDRESS column should show an IP or hostname +``` + +**4. Verify backend service exists:** +```bash +kubectl get svc app -n api-forge-prod +kubectl get endpoints app -n api-forge-prod +# Endpoints should list pod IPs +``` + +### 404 Not Found + +**Cause:** Host header doesn't match any Ingress rule. + +**Solution:** +```bash +# Check the configured host +kubectl describe ingress app -n api-forge-prod | grep Host + +# Ensure you're using the correct hostname +curl -H "Host: api.example.com" http:///health +``` + +### 502 Bad Gateway + +**Cause:** Backend pods not ready or service misconfiguration. + +**Solution:** +```bash +# Check pods are running +kubectl get pods -n api-forge-prod -l app.kubernetes.io/name=app + +# Check pod logs +kubectl logs -n api-forge-prod -l app.kubernetes.io/name=app + +# Verify service targets correct port +kubectl describe svc app -n api-forge-prod +``` + +### TLS Certificate Issues + +**1. Check TLS secret exists:** +```bash +kubectl get secret api-tls -n api-forge-prod +``` + +**2. If using cert-manager, check certificate status:** +```bash +kubectl get certificate -n api-forge-prod +kubectl describe certificate api-tls -n api-forge-prod +``` + +**3. Check cert-manager logs:** +```bash +kubectl logs -n cert-manager -l app.kubernetes.io/component=controller +``` + +### DNS Resolution Issues + +**1. Verify DNS resolves correctly:** +```bash +nslookup api.example.com +dig api.example.com +``` + +**2. For local development, check /etc/hosts:** +```bash +cat /etc/hosts | grep api.local +``` + +**3. Test with IP directly:** +```bash +# Get the Ingress IP +kubectl get ingress -n api-forge-prod -o jsonpath='{.items[0].status.loadBalancer.ingress[0].ip}' + +# Curl with Host header +curl -H "Host: api.example.com" http:///health +``` + +### Ingress Controller Logs + +```bash +# NGINX Ingress Controller logs +kubectl logs -n ingress-nginx -l app.kubernetes.io/component=controller --tail=100 + +# Look for errors related to your ingress +kubectl logs -n ingress-nginx -l app.kubernetes.io/component=controller | grep api-forge +``` + +## Tips and Best Practices + +1. **Start with HTTP** - Get routing working before adding TLS +2. **Use meaningful hostnames** - Match your actual domain structure +3. **Enable SSL redirect** - Redirect HTTP to HTTPS in production +4. **Set appropriate timeouts** - Configure proxy timeouts for long-running requests +5. 
**Limit request size** - Use `proxy-body-size` annotation for file uploads +6. **Monitor Ingress Controller** - Set up metrics and alerting +7. **Use separate Ingress per environment** - Different hosts for staging vs production + +### Common Annotations + +```yaml +annotations: + # SSL redirect (force HTTPS) + nginx.ingress.kubernetes.io/ssl-redirect: "true" + + # Request body size limit + nginx.ingress.kubernetes.io/proxy-body-size: "10m" + + # Timeouts for slow backends + nginx.ingress.kubernetes.io/proxy-read-timeout: "600" + nginx.ingress.kubernetes.io/proxy-send-timeout: "600" + + # WebSocket support + nginx.ingress.kubernetes.io/proxy-http-version: "1.1" + nginx.ingress.kubernetes.io/upstream-hash-by: "$http_upgrade" + + # Rate limiting + nginx.ingress.kubernetes.io/limit-rps: "10" + nginx.ingress.kubernetes.io/limit-connections: "5" +``` + +## Related Documentation + +- [Kubernetes Deployment Guide](./fastapi-kubernetes-deployment.md) - Complete K8s deployment guide +- [Docker Dev Environment](./fastapi-docker-dev-environment.md) - Local development setup +- [Secrets Management](./security/secrets_management.md) - Managing TLS secrets +- [Production Deployment](./fastapi-production-deployment-docker-compose.md) - Docker Compose alternative + +## Additional Resources + +- [NGINX Ingress Controller](https://kubernetes.github.io/ingress-nginx/) +- [cert-manager Documentation](https://cert-manager.io/docs/) +- [AWS Load Balancer Controller](https://kubernetes-sigs.github.io/aws-load-balancer-controller/) +- [GKE Ingress](https://cloud.google.com/kubernetes-engine/docs/concepts/ingress) +- [Kubernetes Ingress Concepts](https://kubernetes.io/docs/concepts/services-networking/ingress/) diff --git a/infra/helm/api-forge/templates/ingress.yaml b/infra/helm/api-forge/templates/ingress.yaml new file mode 100644 index 0000000..27c70de --- /dev/null +++ b/infra/helm/api-forge/templates/ingress.yaml @@ -0,0 +1,52 @@ +{{- if .Values.app.ingress.enabled -}} +{{/* +Ingress resource for exposing the API application externally. +Supports: +- Multiple hosts and paths +- TLS termination +- Ingress class selection (nginx, traefik, etc.) +- Provider-specific annotations (AWS ALB, GCP, etc.) +*/}} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ .Values.app.name | default "app" }} + namespace: {{ .Values.global.namespace }} + labels: + app.kubernetes.io/name: {{ .Values.app.name | default "app" }} + app.kubernetes.io/component: ingress + {{- include "api-forge.labels" . | nindent 4 }} + {{- with .Values.app.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.app.ingress.className }} + ingressClassName: {{ .Values.app.ingress.className }} + {{- end }} + {{- if .Values.app.ingress.tls }} + tls: + {{- range .Values.app.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.app.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + pathType: {{ .pathType | default "Prefix" }} + backend: + service: + name: {{ $.Values.app.name | default "app" }} + port: + number: {{ $.Values.app.service.port | default 8000 }} + {{- end }} + {{- end }} +{{- end }} diff --git a/infra/helm/api-forge/values.yaml b/infra/helm/api-forge/values.yaml index 191c967..9621158 100644 --- a/infra/helm/api-forge/values.yaml +++ b/infra/helm/api-forge/values.yaml @@ -217,16 +217,92 @@ app: service: type: ClusterIP port: 8000 + # --------------------------------------------------------------------------- + # Ingress Configuration + # --------------------------------------------------------------------------- + # Exposes the API externally via an Ingress controller. + # + # Prerequisites: + # - An Ingress controller must be installed in the cluster + # - For Minikube: minikube addons enable ingress + # - For cloud: Use provider's ingress controller or install nginx-ingress + # + # Example configurations: + # + # Minikube (local development): + # ingress: + # enabled: true + # className: nginx + # hosts: + # - host: api.local + # paths: + # - path: / + # pathType: Prefix + # # Add to /etc/hosts: $(minikube ip) api.local + # + # AWS ALB: + # ingress: + # enabled: true + # className: alb + # annotations: + # alb.ingress.kubernetes.io/scheme: internet-facing + # alb.ingress.kubernetes.io/target-type: ip + # alb.ingress.kubernetes.io/certificate-arn: arn:aws:acm:... + # hosts: + # - host: api.example.com + # paths: + # - path: / + # pathType: Prefix + # tls: + # - hosts: + # - api.example.com + # + # GKE with Google Cloud Load Balancer: + # ingress: + # enabled: true + # className: gce + # annotations: + # kubernetes.io/ingress.global-static-ip-name: api-ip + # networking.gke.io/managed-certificates: api-cert + # hosts: + # - host: api.example.com + # paths: + # - path: / + # pathType: Prefix + # + # NGINX with cert-manager TLS: + # ingress: + # enabled: true + # className: nginx + # annotations: + # cert-manager.io/cluster-issuer: letsencrypt-prod + # nginx.ingress.kubernetes.io/ssl-redirect: "true" + # hosts: + # - host: api.example.com + # paths: + # - path: / + # pathType: Prefix + # tls: + # - secretName: api-tls + # hosts: + # - api.example.com + # --------------------------------------------------------------------------- ingress: enabled: false className: nginx annotations: {} + # nginx.ingress.kubernetes.io/ssl-redirect: "true" + # nginx.ingress.kubernetes.io/proxy-body-size: "10m" + # cert-manager.io/cluster-issuer: letsencrypt-prod hosts: - - host: api-forge.local - paths: - - path: / - pathType: Prefix + - host: api-forge.local + paths: + - path: / + pathType: Prefix tls: [] + # - secretName: api-forge-tls + # hosts: + # - api-forge.local # Horizontal Pod Autoscaler autoscaling: enabled: false diff --git a/src/cli/deploy_commands.py b/src/cli/deploy_commands.py index 8608ba7..2a76d83 100644 --- a/src/cli/deploy_commands.py +++ b/src/cli/deploy_commands.py @@ -57,6 +57,21 @@ def up( "-r", help="Container registry for remote k8s clusters (e.g., ghcr.io/myuser)", ), + ingress: bool = typer.Option( + False, + "--ingress/--no-ingress", + help="Enable Ingress for external access (k8s only)", + ), + ingress_host: str = typer.Option( + None, + "--ingress-host", + help="Ingress hostname (k8s only, e.g., api.example.com)", + ), + 
ingress_tls_secret: str = typer.Option( + None, + "--ingress-tls-secret", + help="TLS secret name for HTTPS (k8s only)", + ), ) -> None: """ 🚀 Deploy the application to the specified environment. @@ -101,6 +116,9 @@ def up( no_wait=no_wait, force_recreate=force_recreate, registry=registry, + ingress_enabled=ingress, + ingress_host=ingress_host, + ingress_tls_secret=ingress_tls_secret, ) except DeploymentError as e: diff --git a/src/cli/deployment/helm_deployer/deployer.py b/src/cli/deployment/helm_deployer/deployer.py index 879826d..c35f574 100644 --- a/src/cli/deployment/helm_deployer/deployer.py +++ b/src/cli/deployment/helm_deployer/deployer.py @@ -164,6 +164,9 @@ def deploy( self, namespace: str | None = None, registry: str | None = None, + ingress_enabled: bool = False, + ingress_host: str | None = None, + ingress_tls_secret: str | None = None, **kwargs: Any, ) -> None: """Deploy to Kubernetes cluster. @@ -182,6 +185,9 @@ def deploy( Args: namespace: Kubernetes namespace (default: api-forge-prod) registry: Container registry for remote clusters + ingress_enabled: Whether to enable Ingress for external access + ingress_host: Hostname for Ingress (e.g., api.example.com) + ingress_tls_secret: TLS secret name for HTTPS **kwargs: Reserved for future options """ if not self.check_env_file(): @@ -244,7 +250,11 @@ def deploy( # Phase 4: Deploy via Helm image_override_file = self.helm_release.create_image_override_file( - image_tag, registry + image_tag, + registry, + ingress_enabled=ingress_enabled, + ingress_host=ingress_host, + ingress_tls_secret=ingress_tls_secret, ) self.helm_release.deploy_release(namespace, image_override_file) diff --git a/src/cli/deployment/helm_deployer/helm_release.py b/src/cli/deployment/helm_deployer/helm_release.py index 76f2420..5a8e9b3 100644 --- a/src/cli/deployment/helm_deployer/helm_release.py +++ b/src/cli/deployment/helm_deployer/helm_release.py @@ -8,7 +8,7 @@ import tempfile from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import yaml # type: ignore[import-untyped] @@ -55,12 +55,18 @@ def create_image_override_file( self, image_tag: str, registry: str | None = None, + ingress_enabled: bool = False, + ingress_host: str | None = None, + ingress_tls_secret: str | None = None, ) -> Path: - """Create a temporary values file to override image tags. + """Create a temporary values file to override image tags and ingress. 
Args: image_tag: The unique image tag to use for all images registry: Optional container registry prefix for remote clusters + ingress_enabled: Whether to enable Ingress for external access + ingress_host: Hostname for Ingress (e.g., api.example.com) + ingress_tls_secret: TLS secret name for HTTPS Returns: Path to the temporary override file @@ -77,7 +83,7 @@ def create_image_override_file( redis_repo = self.constants.REDIS_IMAGE_NAME temporal_repo = self.constants.TEMPORAL_IMAGE_NAME - override_values = { + override_values: dict[str, Any] = { "app": {"image": {"repository": app_repo, "tag": image_tag}}, "worker": {"image": {"repository": app_repo, "tag": image_tag}}, # Use content-based tags for infra images to avoid stale image issues @@ -86,6 +92,29 @@ def create_image_override_file( "temporal": {"image": {"repository": temporal_repo, "tag": image_tag}}, } + # Add ingress configuration if enabled + if ingress_enabled: + ingress_config: dict[str, Any] = {"enabled": True} + + # Set hostname if provided + host = ingress_host or "api.local" + ingress_config["hosts"] = [ + {"host": host, "paths": [{"path": "/", "pathType": "Prefix"}]} + ] + + # Add TLS configuration if secret is provided + if ingress_tls_secret: + ingress_config["tls"] = [ + {"secretName": ingress_tls_secret, "hosts": [host]} + ] + + override_values["app"]["ingress"] = ingress_config + + self.console.print( + f"[bold cyan]🌐 Ingress enabled:[/bold cyan] {host}" + + (f" (TLS: {ingress_tls_secret})" if ingress_tls_secret else "") + ) + temp_file = Path(tempfile.mktemp(suffix=".yaml", prefix="helm-image-override-")) with open(temp_file, "w") as f: yaml.dump(override_values, f, default_flow_style=False) From a22c701570e13478334a4e974a4134c1880c2a04 Mon Sep 17 00:00:00 2001 From: jc Date: Wed, 3 Dec 2025 22:52:22 -0500 Subject: [PATCH 4/5] Adding ingress docs --- docs/fastapi-kubernetes-ingress.md | 146 ++++++++++++++++++++++++++--- 1 file changed, 135 insertions(+), 11 deletions(-) diff --git a/docs/fastapi-kubernetes-ingress.md b/docs/fastapi-kubernetes-ingress.md index 6caa455..d5f1497 100644 --- a/docs/fastapi-kubernetes-ingress.md +++ b/docs/fastapi-kubernetes-ingress.md @@ -228,9 +228,53 @@ uv run api-forge-cli deploy up k8s \ ### Option 2: Cert-Manager (Recommended for Production) -Use [cert-manager](https://cert-manager.io/) for automatic Let's Encrypt certificates: - -**1. Install cert-manager:** +Use [cert-manager](https://cert-manager.io/) for automatic Let's Encrypt certificates. + +> **Note:** The existing self-signed certificate mechanism (used for PostgreSQL, Redis) won't work for Ingress TLS. Browsers and external clients don't trust the internal CA, OAuth providers reject self-signed certificates, and there's no way to install the CA on all public clients. Use cert-manager with Let's Encrypt for production, or skip TLS for local development. + +#### Understanding Cert-Manager Architecture + +Cert-manager consists of **two parts**: + +1. **The Deployment** (pods that do the work) - Installed via Helm +2. 
**ClusterIssuer** (configuration) - Tells cert-manager HOW to obtain certificates + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ cert-manager namespace │ +│ ┌─────────────────────┐ │ +│ │ cert-manager pod │◄─── The actual controller (deployment) │ +│ │ (watches Ingresses) │ │ +│ └──────────┬──────────┘ │ +│ │ │ +│ │ Reads configuration from: │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ ClusterIssuer: letsencrypt-prod (cluster-scoped config) │ │ +│ │ - ACME server: Let's Encrypt │ │ +│ │ - Solver: HTTP-01 via nginx ingress │ │ +│ │ - Account key: stored in Secret │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────┘ + │ + │ When Ingress has annotation: + │ cert-manager.io/cluster-issuer: letsencrypt-prod + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ api-forge-prod namespace │ +│ ┌─────────────────────┐ ┌─────────────────────┐ │ +│ │ Ingress: app │────────▶│ Secret: api-tls │ │ +│ │ (your config) │ │ (created by │ │ +│ │ │ │ cert-manager) │ │ +│ └─────────────────────┘ └─────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +The `ClusterIssuer` is **not a deployment** - it's a Custom Resource (configuration object) that cert-manager reads. You create it once per cluster, and any Ingress in any namespace can reference it. + +#### Step-by-Step Setup + +**1. Install cert-manager (the deployment):** ```bash helm repo add jetstack https://charts.jetstack.io helm repo update @@ -239,21 +283,38 @@ helm install cert-manager jetstack/cert-manager \ --namespace cert-manager \ --create-namespace \ --set installCRDs=true + +# Wait for pods to be ready +kubectl wait --namespace cert-manager \ + --for=condition=ready pod \ + --selector=app.kubernetes.io/instance=cert-manager \ + --timeout=120s ``` -**2. Create a ClusterIssuer:** +This creates: +- `cert-manager` pod - the controller that obtains certificates +- `cert-manager-webhook` pod - validates custom resources +- `cert-manager-cainjector` pod - injects CA bundles +- Custom Resource Definitions (CRDs) for `ClusterIssuer`, `Certificate`, etc. + +**2. Create a ClusterIssuer (configuration only - no new pods):** ```yaml # cluster-issuer.yaml apiVersion: cert-manager.io/v1 kind: ClusterIssuer metadata: name: letsencrypt-prod + # No namespace - ClusterIssuer is cluster-scoped spec: acme: + # Email for Let's Encrypt expiry notifications email: your-email@example.com + # Production ACME server server: https://acme-v02.api.letsencrypt.org/directory + # Secret to store your ACME account private key (created automatically) privateKeySecretRef: name: letsencrypt-prod-account-key + # How to prove domain ownership solvers: - http01: ingress: @@ -262,8 +323,15 @@ spec: ```bash kubectl apply -f cluster-issuer.yaml + +# Verify it's ready +kubectl get clusterissuer letsencrypt-prod +# NAME READY AGE +# letsencrypt-prod True 30s ``` +> **About `privateKeySecretRef`:** The `letsencrypt-prod-account-key` secret is created automatically by cert-manager on first use. It stores your Let's Encrypt account private key - you don't create it manually. + **3. Configure Ingress in values.yaml:** ```yaml app: @@ -283,13 +351,69 @@ app: - api.example.com ``` -**How it works:** -1. Deploy the Ingress with the annotation and non-existent TLS secret -2. Cert-manager detects the `cert-manager.io/cluster-issuer` annotation -3. 
Cert-manager contacts Let's Encrypt, proves domain ownership via HTTP-01 challenge -4. Cert-manager creates the `api-tls` secret with the certificate -5. NGINX Ingress loads the certificate and serves HTTPS -6. Cert-manager auto-renews before expiry +#### Certificate Issuance Flow + +When you deploy with the above configuration: + +1. **Ingress created** with `cert-manager.io/cluster-issuer` annotation +2. **Cert-manager detects** the annotation (it watches all Ingresses) +3. **Cert-manager creates** a temporary Ingress for the HTTP-01 challenge +4. **Let's Encrypt calls** `http://api.example.com/.well-known/acme-challenge/` +5. **Cert-manager responds** with proof of domain ownership +6. **Let's Encrypt issues** the certificate +7. **Cert-manager stores** it in the `api-tls` Secret +8. **NGINX Ingress loads** the certificate and serves HTTPS +9. **Cert-manager auto-renews** before expiry (default: 30 days before) + +#### Verify Certificate Status + +```bash +# Check ClusterIssuer is ready +kubectl get clusterissuer letsencrypt-prod + +# Check Certificate resource (created automatically from Ingress) +kubectl get certificate -n api-forge-prod +kubectl describe certificate -n api-forge-prod + +# Check the TLS secret was created +kubectl get secret api-tls -n api-forge-prod + +# If issues, check cert-manager logs +kubectl logs -n cert-manager -l app.kubernetes.io/component=controller --tail=100 +``` + +#### ClusterIssuer vs Issuer + +| Type | Scope | Use Case | +|------|-------|----------| +| `ClusterIssuer` | Cluster-wide | One issuer, all namespaces can use it | +| `Issuer` | Single namespace | Per-namespace control, multi-tenant clusters | + +For most setups, `ClusterIssuer` is simpler - create once, reference from any namespace. + +#### Staging vs Production + +Let's Encrypt has [rate limits](https://letsencrypt.org/docs/rate-limits/). Use staging for testing: + +```yaml +# cluster-issuer-staging.yaml +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-staging +spec: + acme: + email: your-email@example.com + server: https://acme-staging-v02.api.letsencrypt.org/directory # Staging server + privateKeySecretRef: + name: letsencrypt-staging-account-key + solvers: + - http01: + ingress: + class: nginx +``` + +Staging certificates are **not trusted by browsers** but let you test the full flow without hitting rate limits. ## Cloud Provider Examples From e26d5f4a210f54aca855dd73be093b4c7496597b Mon Sep 17 00:00:00 2001 From: jc Date: Wed, 3 Dec 2025 23:16:39 -0500 Subject: [PATCH 5/5] checkpoint --- docs/fastapi-flyio-kubernetes.md | 309 +++++++++++++++++++++++++++++++ 1 file changed, 309 insertions(+) create mode 100644 docs/fastapi-flyio-kubernetes.md diff --git a/docs/fastapi-flyio-kubernetes.md b/docs/fastapi-flyio-kubernetes.md new file mode 100644 index 0000000..c42d181 --- /dev/null +++ b/docs/fastapi-flyio-kubernetes.md @@ -0,0 +1,309 @@ +# FastAPI Fly.io Kubernetes (FKS) Deployment + +This document covers deploying API Forge to Fly.io's Kubernetes Service (FKS), including compatibility analysis with our existing Helm-based deployment and future CLI design considerations. + +## Overview + +Fly.io Kubernetes Service (FKS) is Fly.io's managed Kubernetes offering that integrates with their global Anycast network. It provides a different deployment model than traditional Kubernetes clusters (GKE, EKS, AKS), with some unique advantages and constraints. + +> **Status:** FKS support is planned but not yet implemented. 
This document captures research and design decisions for future development. + +## FKS Key Characteristics + +### How FKS Differs from Standard Kubernetes + +| Aspect | Standard K8s (GKE, EKS, AKS) | Fly.io FKS | +|--------|------------------------------|------------| +| **Ingress** | Ingress resource + Ingress Controller (nginx) | Not used - Fly's Anycast proxy handles routing | +| **LoadBalancer** | Cloud provider provisions external IP | Maps directly to Fly.io's edge network | +| **TLS Certificates** | cert-manager + Let's Encrypt | Automatic - Fly provisions and renews certs | +| **DNS** | Manual - point domain to LB IP | Automatic `*.fly.dev` + custom domain support | +| **Global Distribution** | Multi-region requires complex setup | Built-in - deploy to 30+ regions easily | +| **Scaling** | HPA, node autoscaling | Fly Machines with automatic scaling | + +### What FKS Provides Automatically + +1. **Automatic TLS** - No cert-manager, no Let's Encrypt configuration needed +2. **Global Anycast** - Traffic routed to nearest region automatically +3. **DDoS Protection** - Built into Fly's edge network +4. **Automatic DNS** - `yourapp.fly.dev` domains provisioned automatically +5. **Custom Domains** - Simple CNAME setup with automatic certificate issuance + +### What FKS Does NOT Use + +- ❌ Ingress resources +- ❌ Ingress Controllers (nginx, traefik, etc.) +- ❌ cert-manager +- ❌ External DNS controllers +- ❌ Cloud provider load balancer integrations + +## Exposing Services on FKS + +### LoadBalancer Service (Recommended) + +On FKS, you expose services using `type: LoadBalancer` which Fly.io intercepts: + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: app + annotations: + # Fly-specific annotations + fly.io/app: my-fastapi-app +spec: + type: LoadBalancer + ports: + - port: 443 + targetPort: 8000 + selector: + app.kubernetes.io/name: app +``` + +Fly.io then: +1. Creates a Fly App if it doesn't exist +2. Provisions `my-fastapi-app.fly.dev` domain +3. Issues TLS certificate automatically +4. Routes global traffic through Anycast + +### Custom Domains + +```yaml +metadata: + annotations: + fly.io/app: my-fastapi-app + fly.io/domains: "api.example.com,api.mycompany.io" +``` + +Then add a CNAME record: +``` +api.example.com CNAME my-fastapi-app.fly.dev +``` + +Fly automatically issues certificates for custom domains. + +## Compatibility Analysis + +### Current API Forge Features vs FKS + +| API Forge Feature | Standard K8s | FKS Compatibility | +|-------------------|--------------|-------------------| +| Helm chart deployment | ✅ Works | ✅ Works (FKS is standard K8s) | +| PostgreSQL StatefulSet | ✅ Works | ⚠️ Consider Fly Postgres instead | +| Redis Deployment | ✅ Works | ⚠️ Consider Fly Redis (Upstash) | +| Temporal | ✅ Works | ✅ Works | +| `--ingress` flag | Creates Ingress | ❌ Should create LoadBalancer instead | +| `--ingress-host` | Sets Ingress host | Should set Fly app name | +| `--ingress-tls-secret` | References K8s Secret | ❌ Not needed | +| `--ingress-tls auto` | Uses cert-manager | ❌ Not needed | +| NetworkPolicies | ✅ Works | ✅ Works | +| PersistentVolumeClaims | ✅ Works | ✅ Works (Fly Volumes) | + +### Components That Need Adaptation + +1. **Ingress → LoadBalancer Service** + - Replace Ingress resource with LoadBalancer Service + - Add Fly-specific annotations + - Remove Ingress Controller dependency + +2. **TLS Configuration** + - Remove cert-manager setup + - Remove TLS secret references + - Fly handles all certificate management + +3. 
**Database Considerations** + - Could use in-cluster PostgreSQL (works but not recommended) + - Better: Use Fly Postgres (managed, with replicas) + - Fly Postgres uses their own clustering solution + +4. **Redis Considerations** + - Could use in-cluster Redis (works) + - Alternative: Upstash Redis (Fly partnership, serverless) + +## Proposed CLI Design + +### Option A: Unified Command with Detection (Complex) + +```bash +# CLI detects cluster type and adapts +uv run api-forge-cli deploy up k8s --ingress --ingress-host myapp + +# On FKS: Creates LoadBalancer Service with Fly annotations +# On standard K8s: Creates Ingress + optional cert-manager +``` + +**Pros:** Single command, automatic adaptation +**Cons:** Complex logic, harder to debug, surprises users + +### Option B: Separate Target (Recommended) + +```bash +# Explicit Fly.io target +uv run api-forge-cli deploy up fly --app myapp --region ord + +# Standard Kubernetes +uv run api-forge-cli deploy up k8s --ingress --ingress-host api.example.com +``` + +**Pros:** Clear intent, simpler implementation, Fly-specific optimizations +**Cons:** Another target to maintain + +### Recommended: Option B + +Separate targets are better because: + +1. **Fly has unique features** - Machines, regions, Fly Postgres are Fly-specific +2. **Simpler Helm chart** - No conditionals for "is this FKS?" +3. **Better UX** - Users explicitly choose their target +4. **Fly CLI integration** - Can leverage `flyctl` where appropriate +5. **Different defaults** - FKS might skip in-cluster Postgres entirely + +### Proposed CLI Commands + +```bash +# Setup Fly.io (one-time) +uv run api-forge-cli deploy setup fly +# - Verifies flyctl is installed +# - Authenticates with Fly.io +# - Creates Fly organization if needed + +# Deploy to Fly Kubernetes +uv run api-forge-cli deploy up fly \ + --app my-fastapi-app \ + --region ord \ + --postgres fly # Use Fly Postgres (recommended) + --redis upstash # Use Upstash Redis (optional) + +# Or with in-cluster databases (not recommended for production) +uv run api-forge-cli deploy up fly \ + --app my-fastapi-app \ + --postgres in-cluster \ + --redis in-cluster + +# Status +uv run api-forge-cli deploy status fly --app my-fastapi-app + +# Teardown +uv run api-forge-cli deploy down fly --app my-fastapi-app +``` + +### Implementation Phases + +**Phase 1: Basic FKS Support** +- Deploy app and worker to FKS +- Use LoadBalancer Service for external access +- In-cluster PostgreSQL and Redis (same as standard K8s) + +**Phase 2: Fly-Native Services** +- Fly Postgres integration +- Upstash Redis integration +- Fly Volumes for persistent storage + +**Phase 3: Advanced Features** +- Multi-region deployment +- Fly Machines autoscaling +- Fly.io metrics integration + +## Helm Chart Modifications for FKS + +### Conditional Ingress vs LoadBalancer + +```yaml +# values.yaml +app: + # Standard K8s ingress (existing) + ingress: + enabled: false + # ... 
existing config + + # Fly.io specific (new) + fly: + enabled: false + app: "" + regions: ["ord"] + domains: [] +``` + +### FKS-Specific Service Template + +```yaml +# templates/services/app-fly.yaml +{{- if .Values.app.fly.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ .Values.app.name | default "app" }} + namespace: {{ .Values.global.namespace }} + annotations: + fly.io/app: {{ .Values.app.fly.app | required "app.fly.app is required" }} + {{- if .Values.app.fly.domains }} + fly.io/domains: {{ .Values.app.fly.domains | join "," | quote }} + {{- end }} + labels: + {{- include "api-forge.labels" . | nindent 4 }} +spec: + type: LoadBalancer + ports: + - port: 443 + targetPort: {{ .Values.app.service.port | default 8000 }} + name: https + selector: + app.kubernetes.io/name: {{ .Values.app.name | default "app" }} +{{- end }} +``` + +## TLS Strategy by Platform + +| Platform | TLS Strategy | CLI Flag | +|----------|--------------|----------| +| **Minikube** | None (HTTP) or self-signed | `--ingress` (no TLS) | +| **Standard K8s** | cert-manager + Let's Encrypt | `--ingress --ingress-tls auto` | +| **Standard K8s** | Manual certificate | `--ingress --ingress-tls-secret name` | +| **AWS EKS** | ACM certificate | `--ingress` + ACM annotation | +| **GKE** | Google-managed cert | `--ingress` + ManagedCertificate | +| **Fly.io FKS** | Automatic (Fly-managed) | `deploy up fly` (TLS automatic) | + +## Database Strategy by Platform + +| Platform | Recommended PostgreSQL | Recommended Redis | +|----------|----------------------|-------------------| +| **Development** | Docker Compose (local) | Docker Compose (local) | +| **Minikube** | In-cluster StatefulSet | In-cluster Deployment | +| **Standard K8s** | In-cluster or managed (RDS, Cloud SQL) | In-cluster or managed | +| **Fly.io FKS** | Fly Postgres (managed) | Upstash Redis or in-cluster | + +## Migration Path + +### From Docker Compose to FKS + +1. **Test locally** with `deploy up dev` +2. **Test on Minikube** with `deploy up k8s` +3. **Deploy to FKS** with `deploy up fly` + +### From Standard K8s to FKS + +1. **Export data** from existing PostgreSQL +2. **Create Fly Postgres** cluster +3. **Import data** to Fly Postgres +4. **Deploy app** to FKS with `--postgres fly` +5. **Update DNS** to point to Fly + +## Current Limitations + +1. **FKS is relatively new** - Some features may change +2. **Fly Postgres clustering** - Different from standard PostgreSQL HA +3. **Temporal on Fly** - May need special consideration for workflows +4. **Cost model** - Fly charges differently than traditional cloud + +## Related Documentation + +- [Kubernetes Deployment Guide](./fastapi-kubernetes-deployment.md) - Standard K8s deployment +- [Ingress Configuration](./fastapi-kubernetes-ingress.md) - Ingress and TLS for standard K8s +- [Docker Dev Environment](./fastapi-docker-dev-environment.md) - Local development + +## External Resources + +- [Fly.io Kubernetes Documentation](https://fly.io/docs/kubernetes/) +- [Fly.io FKS Quickstart](https://fly.io/docs/kubernetes/fks-quickstart/) +- [Fly Postgres](https://fly.io/docs/postgres/) +- [Upstash Redis on Fly](https://fly.io/docs/reference/redis/)
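
## Appendix: Trying the Proposed FKS Values by Hand

The pieces above (the `app.fly.*` values and the `app-fly.yaml` Service template) are still proposals, but they could be exercised manually with nothing more than Helm and kubectl once an FKS cluster is available. The sketch below is illustrative only: it assumes the chart lives at `./infra/helm/api-forge`, that your kubectl context already points at the FKS cluster, and that `api-forge`, `api-forge-prod`, `my-fastapi-app`, and `api.example.com` are placeholder names to adjust for your setup.

```bash
# Rough dry run of the proposed FKS integration (sketch, not implemented yet).
# Assumptions: chart at ./infra/helm/api-forge, kubectl already targets an FKS
# cluster, and all names/domains below are placeholders.

# Render only the proposed Fly LoadBalancer Service to inspect its annotations
helm template api-forge ./infra/helm/api-forge \
  --set app.fly.enabled=true \
  --set app.fly.app=my-fastapi-app \
  --set "app.fly.domains={api.example.com}" \
  --show-only templates/services/app-fly.yaml

# Install with the standard Ingress disabled (Fly's Anycast proxy replaces it)
helm upgrade --install api-forge ./infra/helm/api-forge \
  --namespace api-forge-prod --create-namespace \
  --set global.namespace=api-forge-prod \
  --set app.ingress.enabled=false \
  --set app.fly.enabled=true \
  --set app.fly.app=my-fastapi-app

# Confirm the LoadBalancer Service exists and carries the fly.io/ annotations
kubectl get service app -n api-forge-prod
kubectl describe service app -n api-forge-prod | grep 'fly.io/'

# Custom domain: add the CNAME described earlier, then verify HTTPS end to end
#   api.example.com CNAME my-fastapi-app.fly.dev
curl -I https://api.example.com/
```

If the rendered Service and its `fly.io/app` annotation look right, the same flags map directly onto the future `deploy up fly` flow described under Proposed CLI Commands.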