diff --git a/infra/helm/cert-manager/letsencrypt-staging.yaml b/infra/helm/cert-manager/letsencrypt-staging.yaml new file mode 100644 index 0000000..2fa6a77 --- /dev/null +++ b/infra/helm/cert-manager/letsencrypt-staging.yaml @@ -0,0 +1,24 @@ +# ClusterIssuer for Let's Encrypt TLS certificates +# Generated by: uv run api-forge-cli k8s setup-tls --email pieware@gmail.com +# This is a cluster-scoped resource (not namespaced). +# Apply with: kubectl apply -f infra/helm/cert-manager/letsencrypt-staging.yaml +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-staging + labels: + app.kubernetes.io/managed-by: api-forge-cli +spec: + acme: + # Let's Encrypt ACME server + server: https://acme-staging-v02.api.letsencrypt.org/directory + # Email for certificate expiration notifications + email: pieware@gmail.com + # Secret to store the ACME account private key + privateKeySecretRef: + name: letsencrypt-staging-account-key + # HTTP-01 challenge solver using NGINX ingress + solvers: + - http01: + ingress: + class: nginx diff --git a/pyproject.toml b/pyproject.toml index b6f7748..a802980 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ dependencies = [ "temporalio>=1.18.1", "requests>=2.32.5", "ruamel.yaml>=0.18.6", + "kr8s>=0.20.14", ] [build-system] diff --git a/src/cli/__init__.py b/src/cli/__init__.py index 6be2076..b1cd3d1 100644 --- a/src/cli/__init__.py +++ b/src/cli/__init__.py @@ -1,10 +1,30 @@ -"""Main CLI application module.""" +"""Main CLI application module. + +This module provides the main entry point for the API Forge CLI. +Commands are organized by deployment target (dev, prod, k8s, fly) +rather than by operation type (up, down, status). + +Command Groups: +- dev: Development Docker Compose environment +- prod: Production Docker Compose deployment +- k8s: Kubernetes Helm deployment +- fly: Fly.io Kubernetes (coming soon) +- entity: Entity/model scaffolding +- secrets: Secret management +- users: Keycloak user management (dev) +""" import typer -from .deploy_commands import deploy_app -from .entity_commands import entity_app -from .secrets_commands import secrets_app +from .commands import ( + dev_app, + entity_app, + fly_app, + k8s_app, + prod_app, + secrets_app, + users_app, +) # Create the main CLI application app = typer.Typer( @@ -13,10 +33,16 @@ rich_markup_mode="rich", ) -# Register command groups -app.add_typer(deploy_app, name="deploy") +# Register deployment target command groups +app.add_typer(dev_app, name="dev", help="Development environment commands") +app.add_typer(prod_app, name="prod", help="Production Docker Compose commands") +app.add_typer(k8s_app, name="k8s", help="Kubernetes Helm deployment commands") +app.add_typer(fly_app, name="fly", help="Fly.io Kubernetes commands (coming soon)") + +# Register utility command groups app.add_typer(entity_app, name="entity") app.add_typer(secrets_app, name="secrets") +app.add_typer(users_app, name="users") def main() -> None: diff --git a/src/cli/commands/__init__.py b/src/cli/commands/__init__.py new file mode 100644 index 0000000..8d79da5 --- /dev/null +++ b/src/cli/commands/__init__.py @@ -0,0 +1,32 @@ +"""CLI command modules organized by deployment target. + +This package provides the restructured CLI with separate command groups +for each deployment target (dev, prod, k8s, fly) and utilities (entity, secrets, users). 
+ +Command Groups: +- dev: Development environment using Docker Compose +- prod: Production Docker Compose deployment +- k8s: Kubernetes deployment using Helm +- fly: Fly.io Kubernetes (FKS) deployment (future) +- entity: Entity/model scaffolding +- secrets: Secret management utilities +- users: Keycloak user management (dev environment) +""" + +from .dev import app as dev_app +from .entity import entity_app +from .fly import fly_app +from .k8s import k8s_app +from .prod import prod_app +from .secrets import secrets_app +from .users import users_app + +__all__ = [ + "dev_app", + "prod_app", + "k8s_app", + "fly_app", + "entity_app", + "secrets_app", + "users_app", +] diff --git a/src/cli/commands/dev.py b/src/cli/commands/dev.py new file mode 100644 index 0000000..aeb0a1d --- /dev/null +++ b/src/cli/commands/dev.py @@ -0,0 +1,264 @@ +"""Development environment CLI commands. + +This module provides commands for managing the Docker Compose +development environment including Keycloak, PostgreSQL, Redis, and Temporal. + +Commands: + up - Start the development environment + down - Stop the development environment + status - Show status of development services + logs - View logs from a service + restart - Restart a specific service +""" + +from pathlib import Path + +import typer + +from src.cli.deployment import DevDeployer +from src.cli.deployment.helm_deployer.image_builder import DeploymentError + +from .shared import ( + confirm_action, + console, + get_project_root, + handle_error, + print_header, +) + +# Create the dev command group +app = typer.Typer( + name="dev", + help="🔧 Development environment commands (Docker Compose)", + no_args_is_help=True, +) + + +def _get_deployer() -> DevDeployer: + """Create a DevDeployer instance with current project context.""" + return DevDeployer(console, Path(get_project_root())) + + +# ============================================================================= +# Commands +# ============================================================================= + + +@app.command() +def up( + force: bool = typer.Option( + False, + "--force", + "-f", + help="Force restart even if services are already running", + ), + no_wait: bool = typer.Option( + False, + "--no-wait", + help="Don't wait for services to be healthy", + ), + start_server: bool = typer.Option( + True, + "--start-server/--no-start-server", + help="Start FastAPI dev server after services are ready", + ), +) -> None: + """🚀 Start the development environment. + + Starts all development services (Keycloak, PostgreSQL, Redis, Temporal) + using Docker Compose, then optionally starts the FastAPI development server. + + Examples: + # Start everything including dev server + api-forge-cli dev up + + # Start services only, no dev server + api-forge-cli dev up --no-start-server + + # Force restart all services + api-forge-cli dev up --force + """ + print_header("Starting Development Environment") + + try: + deployer = _get_deployer() + deployer.deploy(force=force, no_wait=no_wait, start_server=start_server) + except DeploymentError as e: + handle_error(f"Deployment failed: {e.message}", e.details) + + +@app.command() +def down( + volumes: bool = typer.Option( + False, + "--volumes", + "-v", + help="Also remove data volumes (DESTROYS ALL DATA)", + ), + yes: bool = typer.Option( + False, + "--yes", + "-y", + help="Skip confirmation prompt", + ), +) -> None: + """âšī¸ Stop the development environment. + + Stops all Docker Compose services. Use --volumes to also remove + persistent data (databases, caches). 
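+    A confirmation prompt is shown before stopping; pass --yes to skip it.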
+ + Examples: + # Stop services (preserves data) + api-forge-cli dev down + + # Stop and remove all data + api-forge-cli dev down --volumes + """ + details = "This will stop all development Docker Compose services." + extra_warning = None + + if volumes: + extra_warning = ( + "âš ī¸ --volumes flag is set: ALL DATA WILL BE PERMANENTLY DELETED!\n" + " This includes databases, caches, and any persistent storage." + ) + + if not confirm_action( + action="Stop development environment", + details=details, + extra_warning=extra_warning, + force=yes, + ): + console.print("[dim]Operation cancelled.[/dim]") + raise typer.Exit(0) + + print_header("Stopping Development Environment", style="red") + + try: + deployer = _get_deployer() + deployer.teardown(volumes=volumes) + except DeploymentError as e: + handle_error(f"Teardown failed: {e.message}", e.details) + + +@app.command() +def status() -> None: + """📊 Show status of development services. + + Displays the current status of all development services including + health check results and connection information. + + Examples: + api-forge-cli dev status + """ + deployer = _get_deployer() + deployer.show_status() + + +@app.command() +def logs( + service: str = typer.Argument( + None, + help="Service name (keycloak, postgres, redis, temporal). Shows all if omitted.", + ), + follow: bool = typer.Option( + False, + "--follow", + "-f", + help="Follow log output", + ), + tail: int = typer.Option( + 100, + "--tail", + "-n", + help="Number of lines to show from the end", + ), +) -> None: + """📜 View logs from development services. + + Shows logs from Docker Compose services. Specify a service name + to view logs from a single service. + + Examples: + # View all logs + api-forge-cli dev logs + + # View PostgreSQL logs + api-forge-cli dev logs postgres + + # Follow Keycloak logs + api-forge-cli dev logs keycloak --follow + """ + import subprocess + + compose_file = "docker-compose.dev.yml" + cmd = ["docker", "compose", "-f", compose_file, "logs"] + + if tail: + cmd.extend(["--tail", str(tail)]) + + if follow: + cmd.append("--follow") + + if service: + # Map friendly names to Docker Compose service names + service_map = { + "keycloak": "keycloak", + "postgres": "postgres", + "redis": "redis", + "temporal": "temporal", + "temporal-ui": "temporal-web", + } + compose_service = service_map.get(service.lower(), service) + cmd.append(compose_service) + + try: + subprocess.run(cmd, cwd=get_project_root(), check=True) + except subprocess.CalledProcessError as e: + handle_error(f"Failed to get logs: {e}") + except KeyboardInterrupt: + pass # User cancelled with Ctrl+C + + +@app.command() +def restart( + service: str = typer.Argument( + ..., + help="Service to restart (keycloak, postgres, redis, temporal)", + ), +) -> None: + """🔄 Restart a specific development service. + + Restarts a single service without affecting other services. 
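+    Recognized service names: keycloak, postgres, redis, temporal, temporal-ui.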
+ + Examples: + # Restart PostgreSQL + api-forge-cli dev restart postgres + + # Restart Keycloak + api-forge-cli dev restart keycloak + """ + import subprocess + + compose_file = "docker-compose.dev.yml" + + # Map friendly names to Docker Compose service names + service_map = { + "keycloak": "keycloak", + "postgres": "postgres", + "redis": "redis", + "temporal": "temporal", + "temporal-ui": "temporal-web", + } + + compose_service = service_map.get(service.lower(), service) + + console.print(f"[bold]Restarting {service}...[/bold]") + + cmd = ["docker", "compose", "-f", compose_file, "restart", compose_service] + + try: + subprocess.run(cmd, cwd=get_project_root(), check=True) + console.print(f"[green]✅ {service} restarted successfully[/green]") + except subprocess.CalledProcessError as e: + handle_error(f"Failed to restart {service}: {e}") diff --git a/src/cli/entity_commands.py b/src/cli/commands/entity.py similarity index 99% rename from src/cli/entity_commands.py rename to src/cli/commands/entity.py index 77ae192..c0441ef 100644 --- a/src/cli/entity_commands.py +++ b/src/cli/commands/entity.py @@ -10,7 +10,7 @@ from rich.prompt import Prompt from rich.table import Table -from .utils import console, get_project_root +from .shared import console, get_project_root # Create the entity command group entity_app = typer.Typer(help="🎭 Entity management commands") diff --git a/src/cli/commands/fly.py b/src/cli/commands/fly.py new file mode 100644 index 0000000..666a913 --- /dev/null +++ b/src/cli/commands/fly.py @@ -0,0 +1,130 @@ +"""Fly.io Kubernetes (FKS) deployment commands. + +This module provides a placeholder for future Fly.io Kubernetes Service +deployment commands. FKS is currently in beta and not yet fully supported. + +See docs/fastapi-flyio-kubernetes.md for compatibility analysis. +""" + +from typing import Annotated + +import typer + +from .shared import console, print_header + +# --------------------------------------------------------------------------- +# Typer App +# --------------------------------------------------------------------------- + +fly_app = typer.Typer( + name="fly", + help="Fly.io Kubernetes (FKS) deployment commands (coming soon).", + no_args_is_help=True, +) + + +# --------------------------------------------------------------------------- +# Commands +# --------------------------------------------------------------------------- + + +@fly_app.command() +def up( + cluster: Annotated[ + str | None, + typer.Option( + "--cluster", + "-c", + help="FKS cluster name", + ), + ] = None, +) -> None: + """Deploy to Fly.io Kubernetes Service (coming soon). + + This command is a placeholder for future FKS deployment support. + FKS is currently in beta with some limitations for our use case. + + See docs/fastapi-flyio-kubernetes.md for details. + """ + print_header("Fly.io Kubernetes Deployment") + _show_coming_soon_message() + + +@fly_app.command() +def down( + cluster: Annotated[ + str | None, + typer.Option( + "--cluster", + "-c", + help="FKS cluster name", + ), + ] = None, +) -> None: + """Remove Fly.io Kubernetes deployment (coming soon). + + This command is a placeholder for future FKS deployment support. + """ + print_header("Removing Fly.io Deployment") + _show_coming_soon_message() + + +@fly_app.command() +def status( + cluster: Annotated[ + str | None, + typer.Option( + "--cluster", + "-c", + help="FKS cluster name", + ), + ] = None, +) -> None: + """Show Fly.io Kubernetes deployment status (coming soon). 
+ + This command is a placeholder for future FKS deployment support. + """ + print_header("Fly.io Deployment Status") + _show_coming_soon_message() + + +@fly_app.command() +def clusters() -> None: + """List available FKS clusters (coming soon). + + This command is a placeholder for future FKS support. + """ + print_header("FKS Clusters") + _show_coming_soon_message() + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _show_coming_soon_message() -> None: + """Display the coming soon message with context.""" + from rich.panel import Panel + + message = """[yellow]Fly.io Kubernetes Service (FKS) support is planned but not yet implemented.[/yellow] + +[bold cyan]Current Status:[/bold cyan] +â€ĸ FKS is in public beta +â€ĸ Some features we need (e.g., Ingress) require workarounds +â€ĸ We're monitoring FKS development for GA readiness + +[bold cyan]Key Differences from Standard K8s:[/bold cyan] +â€ĸ Uses LoadBalancer instead of Ingress for external access +â€ĸ No built-in Ingress controller +â€ĸ Uses Fly.io's global edge network for routing +â€ĸ Requires flyctl for cluster management + +[bold cyan]Next Steps:[/bold cyan] +1. Review compatibility analysis: docs/fastapi-flyio-kubernetes.md +2. For standard Kubernetes, use: [green]uv run api-forge-cli k8s up[/green] +3. For Docker Compose production: [green]uv run api-forge-cli prod up[/green] + +[dim]Want to help implement FKS support? Contributions welcome![/dim]""" + + console.print(Panel(message, title="Coming Soon", border_style="yellow")) diff --git a/src/cli/commands/k8s.py b/src/cli/commands/k8s.py new file mode 100644 index 0000000..ce40909 --- /dev/null +++ b/src/cli/commands/k8s.py @@ -0,0 +1,865 @@ +"""Kubernetes Helm deployment commands. + +This module provides commands for deploying, managing, and monitoring +Kubernetes deployments via Helm. +""" + +from typing import TYPE_CHECKING, Annotated + +import typer +from rich.panel import Panel +from rich.table import Table + +from src.infra.k8s import Kr8sController, run_sync + +from .shared import ( + confirm_action, + console, + get_project_root, + print_header, + with_error_handling, +) + +if TYPE_CHECKING: + from src.cli.deployment.helm_deployer.deployer import HelmDeployer + + +# --------------------------------------------------------------------------- +# Kubernetes Controller (module-level singleton) +# --------------------------------------------------------------------------- + +_controller = Kr8sController() + + +# --------------------------------------------------------------------------- +# Deployer Factory +# --------------------------------------------------------------------------- + + +def _get_deployer() -> "HelmDeployer": + """Get the Helm deployer instance. + + Returns: + HelmDeployer instance configured for current project + """ + from src.cli.deployment.helm_deployer.deployer import HelmDeployer + + return HelmDeployer(console, get_project_root()) + + +# --------------------------------------------------------------------------- +# Helper Functions +# --------------------------------------------------------------------------- + + +def _check_cluster_issuer_ready(issuer_name: str) -> bool: + """Check if a ClusterIssuer exists and is ready. 
+ + Args: + issuer_name: Name of the ClusterIssuer to check + + Returns: + True if the ClusterIssuer exists and is ready, False otherwise + """ + status = run_sync(_controller.get_cluster_issuer_status(issuer_name)) + return status.exists and status.ready + + +def _check_cert_manager_installed() -> bool: + """Check if cert-manager is installed in the cluster. + + Returns: + True if cert-manager pods are running, False otherwise + """ + return run_sync(_controller.check_cert_manager_installed()) + + +def _install_cert_manager() -> bool: + """Install cert-manager using Helm. + + Returns: + True if installation succeeded, False otherwise + """ + import subprocess + + console.print("[cyan]Installing cert-manager via Helm...[/cyan]") + + # Add Helm repo + subprocess.run( + ["helm", "repo", "add", "jetstack", "https://charts.jetstack.io"], + capture_output=True, + check=False, + ) + subprocess.run( + ["helm", "repo", "update"], + capture_output=True, + check=False, + ) + + # Install cert-manager + result = subprocess.run( + [ + "helm", + "install", + "cert-manager", + "jetstack/cert-manager", + "--namespace", + "cert-manager", + "--create-namespace", + "--set", + "installCRDs=true", + "--wait", + "--timeout", + "5m", + ], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + console.print("[red]Failed to install cert-manager[/red]") + if result.stderr: + console.print(f"[dim]{result.stderr}[/dim]") + return False + + console.print("[green]✓[/green] cert-manager installed successfully") + return True + + +def _wait_for_cluster_issuer(issuer_name: str, timeout: int = 60) -> bool: + """Wait for a ClusterIssuer to become ready. + + Args: + issuer_name: Name of the ClusterIssuer + timeout: Maximum seconds to wait + + Returns: + True if issuer became ready, False if timeout + """ + import time + + console.print( + f"[dim]Waiting for ClusterIssuer '{issuer_name}' to be ready...[/dim]" + ) + + start = time.time() + while time.time() - start < timeout: + if _check_cluster_issuer_ready(issuer_name): + return True + time.sleep(2) + + # Check if it exists but isn't ready + yaml_output = run_sync(_controller.get_cluster_issuer_yaml(issuer_name)) + if yaml_output: + console.print("[yellow]ClusterIssuer exists but not ready yet[/yellow]") + console.print(f"[dim]{yaml_output}[/dim]") + + return False + + +# --------------------------------------------------------------------------- +# Typer App +# --------------------------------------------------------------------------- + +k8s_app = typer.Typer( + name="k8s", + help="Kubernetes Helm deployment commands.", + no_args_is_help=True, +) + + +# --------------------------------------------------------------------------- +# Commands +# --------------------------------------------------------------------------- + + +@k8s_app.command() +@with_error_handling +def up( + namespace: Annotated[ + str, + typer.Option( + "--namespace", + "-n", + help="Kubernetes namespace", + ), + ] = "api-forge-prod", + registry: Annotated[ + str | None, + typer.Option( + "--registry", + "-r", + help="Container registry URL (e.g., ghcr.io/myuser)", + ), + ] = None, + ingress: Annotated[ + bool, + typer.Option( + "--ingress", + help="Enable Ingress for external access", + ), + ] = False, + ingress_host: Annotated[ + str | None, + typer.Option( + "--ingress-host", + help="Hostname for Ingress (e.g., api.example.com)", + ), + ] = None, + ingress_tls_secret: Annotated[ + str | None, + typer.Option( + "--ingress-tls-secret", + help="TLS secret name for HTTPS (manual 
certificate)", + ), + ] = None, + ingress_tls_auto: Annotated[ + bool, + typer.Option( + "--ingress-tls-auto", + help="Auto-provision TLS via cert-manager (requires setup-tls first)", + ), + ] = False, + ingress_tls_staging: Annotated[ + bool, + typer.Option( + "--ingress-tls-staging", + help="Use Let's Encrypt staging (with --ingress-tls-auto)", + ), + ] = False, +) -> None: + """Deploy to Kubernetes cluster using Helm. + + This command: + - Runs pre-deployment validation with cleanup prompts + - Builds Docker images with content-based tagging + - Loads images into target cluster (Minikube, Kind, or registry) + - Deploys Kubernetes secrets + - Syncs config.yaml to Helm values + - Deploys via Helm upgrade --install + - Waits for rollouts to complete + + Examples: + uv run api-forge-cli k8s up + uv run api-forge-cli k8s up -n my-namespace + uv run api-forge-cli k8s up --registry ghcr.io/myuser + uv run api-forge-cli k8s up --ingress --ingress-host api.example.com + uv run api-forge-cli k8s up --ingress --ingress-host api.example.com --ingress-tls-auto + """ + print_header("Deploying to Kubernetes") + + # Validate TLS options + if ingress_tls_auto and ingress_tls_secret: + console.print( + "[red]Cannot use both --ingress-tls-auto and --ingress-tls-secret[/red]" + ) + raise typer.Exit(1) + + if ingress_tls_auto and not ingress: + console.print( + "[yellow]--ingress-tls-auto implies --ingress, enabling it[/yellow]" + ) + ingress = True + + if ingress_tls_staging and not ingress_tls_auto: + console.print("[red]--ingress-tls-staging requires --ingress-tls-auto[/red]") + raise typer.Exit(1) + + # Check cert-manager is ready if using auto TLS + if ingress_tls_auto: + issuer_name = ( + "letsencrypt-staging" if ingress_tls_staging else "letsencrypt-prod" + ) + if not _check_cluster_issuer_ready(issuer_name): + console.print( + f"[red]ClusterIssuer '{issuer_name}' not found or not ready.[/red]" + ) + console.print("\n[dim]Run setup-tls first:[/dim]") + staging_flag = " --staging" if ingress_tls_staging else "" + console.print( + f" [cyan]uv run api-forge-cli k8s setup-tls --email your@email.com{staging_flag}[/cyan]" + ) + raise typer.Exit(1) + + deployer = _get_deployer() + deployer.deploy( + namespace=namespace, + registry=registry, + ingress_enabled=ingress, + ingress_host=ingress_host, + ingress_tls_secret=ingress_tls_secret, + ingress_tls_auto=ingress_tls_auto, + ingress_tls_staging=ingress_tls_staging, + ) + + +@k8s_app.command() +@with_error_handling +def down( + namespace: Annotated[ + str, + typer.Option( + "--namespace", + "-n", + help="Kubernetes namespace", + ), + ] = "api-forge-prod", + yes: Annotated[ + bool, + typer.Option( + "--yes", + "-y", + help="Skip confirmation prompt", + ), + ] = False, +) -> None: + """Remove Kubernetes deployment. + + Uninstalls the Helm release and deletes the namespace. 
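+    All resources in the namespace, including persistent volume claims,
+    are removed. Pass --yes to skip the confirmation prompt.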
+ + Examples: + uv run api-forge-cli k8s down + uv run api-forge-cli k8s down -n my-namespace + uv run api-forge-cli k8s down -y # Skip confirmation + """ + print_header("Removing Kubernetes Deployment") + + if not yes: + if not confirm_action( + "Remove Kubernetes deployment", + f"This will:\n" + f" â€ĸ Uninstall the Helm release\n" + f" â€ĸ Delete namespace '{namespace}' and all resources\n" + f" â€ĸ Remove all persistent volume claims", + ): + console.print("[dim]Operation cancelled[/dim]") + raise typer.Exit(0) + + deployer = _get_deployer() + deployer.teardown(namespace=namespace) + + +@k8s_app.command() +@with_error_handling +def status( + namespace: Annotated[ + str, + typer.Option( + "--namespace", + "-n", + help="Kubernetes namespace", + ), + ] = "api-forge-prod", +) -> None: + """Show the status of Kubernetes deployment. + + Displays the health and configuration of pods, services, and ingress. + + Examples: + uv run api-forge-cli k8s status + uv run api-forge-cli k8s status -n my-namespace + """ + print_header("Kubernetes Deployment Status") + + deployer = _get_deployer() + deployer.show_status(namespace=namespace) + + +@k8s_app.command() +@with_error_handling +def history( + namespace: Annotated[ + str, + typer.Option( + "--namespace", + "-n", + help="Kubernetes namespace", + ), + ] = "api-forge-prod", + max_revisions: Annotated[ + int, + typer.Option( + "--max", + "-m", + help="Maximum number of revisions to show", + ), + ] = 10, +) -> None: + """Show Kubernetes deployment revision history. + + Displays the Helm release history including revision numbers, + timestamps, status, and descriptions. Use this to identify + which revision to rollback to. + + Examples: + uv run api-forge-cli k8s history + uv run api-forge-cli k8s history --max 5 + """ + print_header("Release History") + + deployer = _get_deployer() + + # Get release history + history_data = deployer.commands.helm.history( + deployer.constants.HELM_RELEASE_NAME, namespace, max_revisions + ) + + if not history_data: + console.print( + f"[yellow]No release history found for '{deployer.constants.HELM_RELEASE_NAME}' " + f"in namespace '{namespace}'[/yellow]" + ) + console.print("\n[dim]Deploy first with: uv run api-forge-cli k8s up[/dim]") + return + + table = Table(show_header=True, header_style="bold") + table.add_column("Revision", justify="right") + table.add_column("Updated") + table.add_column("Status") + table.add_column("Chart") + table.add_column("Description") + + for entry in history_data: + revision = entry.get("revision", "") + updated = entry.get("updated", "")[:19] # Trim timezone + status_str = entry.get("status", "") + chart = entry.get("chart", "") + description = entry.get("description", "")[:40] + + # Color status + if status_str == "deployed": + status_display = f"[green]{status_str}[/green]" + elif status_str in ("failed", "superseded"): + status_display = f"[red]{status_str}[/red]" + elif status_str == "pending-upgrade": + status_display = f"[yellow]{status_str}[/yellow]" + else: + status_display = status_str + + table.add_row(str(revision), updated, status_display, chart, description) + + console.print(table) + + # Show rollback hint + if len(history_data) > 1: + console.print( + "\n[dim]To rollback: uv run api-forge-cli k8s rollback [/dim]" + ) + + +@k8s_app.command() +@with_error_handling +def rollback( + revision: Annotated[ + int | None, + typer.Argument( + help="Revision number to rollback to (default: previous revision)", + ), + ] = None, + namespace: Annotated[ + str, + typer.Option( + 
"--namespace", + "-n", + help="Kubernetes namespace", + ), + ] = "api-forge-prod", + yes: Annotated[ + bool, + typer.Option( + "--yes", + "-y", + help="Skip confirmation prompt", + ), + ] = False, +) -> None: + """Rollback Kubernetes deployment to a previous revision. + + Uses Helm's native rollback functionality to restore + the deployment to a previous working state. + + Examples: + uv run api-forge-cli k8s rollback # Previous revision + uv run api-forge-cli k8s rollback 3 # Specific revision + uv run api-forge-cli k8s history # View history first + """ + print_header("Rollback Deployment") + + deployer = _get_deployer() + + # Get release history + history_data = deployer.commands.helm.history( + deployer.constants.HELM_RELEASE_NAME, namespace + ) + + if not history_data: + console.print( + f"[red]No release history found for '{deployer.constants.HELM_RELEASE_NAME}' " + f"in namespace '{namespace}'[/red]" + ) + console.print("\n[dim]Make sure the release exists and you have access.[/dim]") + raise typer.Exit(1) + + # Show current state + current = history_data[-1] + current_revision = int(current.get("revision", 0)) + + if current_revision <= 1: + console.print( + "[yellow]⚠ Only one revision exists. Nothing to rollback to.[/yellow]" + ) + raise typer.Exit(0) + + # Determine target revision + target_revision = revision if revision is not None else current_revision - 1 + + if target_revision < 1 or target_revision >= current_revision: + console.print( + f"[red]Invalid revision {target_revision}. " + f"Must be between 1 and {current_revision - 1}.[/red]" + ) + raise typer.Exit(1) + + # Find target revision info + target_info = next( + (h for h in history_data if int(h.get("revision", 0)) == target_revision), None + ) + + # Show rollback plan + console.print("\n[bold cyan]📋 Rollback Plan[/bold cyan]\n") + + table = Table(show_header=True, header_style="bold") + table.add_column("", style="dim") + table.add_column("Revision") + table.add_column("Status") + table.add_column("Description") + + table.add_row( + "Current", + str(current_revision), + current.get("status", "unknown"), + current.get("description", "")[:50], + ) + + if target_info: + table.add_row( + "Target", + str(target_revision), + target_info.get("status", "unknown"), + target_info.get("description", "")[:50], + ) + + console.print(table) + + # Confirm + if not yes: + if not confirm_action( + f"Rollback to revision {target_revision}", + f"This will restore the deployment in namespace '{namespace}' " + f"to revision {target_revision}.\n" + "Active pods will be replaced with the previous configuration.", + ): + console.print("[dim]Rollback cancelled.[/dim]") + raise typer.Exit(0) + + # Perform rollback + console.print( + Panel.fit( + f"[bold yellow]âĒ Rolling back to revision {target_revision}[/bold yellow]", + border_style="yellow", + ) + ) + + result = deployer.commands.helm.rollback( + deployer.constants.HELM_RELEASE_NAME, + namespace, + target_revision, + wait=True, + timeout="5m", + ) + + if result.success: + console.print( + f"\n[bold green]✅ Successfully rolled back to revision {target_revision}![/bold green]" + ) + console.print("\n[dim]Run 'uv run api-forge-cli k8s status' to verify.[/dim]") + else: + console.print("\n[bold red]❌ Rollback failed[/bold red]") + if result.stderr: + console.print(Panel(result.stderr, title="Error", border_style="red")) + raise typer.Exit(1) + + +@k8s_app.command() +@with_error_handling +def logs( + pod: Annotated[ + str | None, + typer.Argument( + help="Pod name or label selector (e.g., 
'app=api-forge')", + ), + ] = None, + namespace: Annotated[ + str, + typer.Option( + "--namespace", + "-n", + help="Kubernetes namespace", + ), + ] = "api-forge-prod", + container: Annotated[ + str | None, + typer.Option( + "--container", + "-c", + help="Container name (if pod has multiple containers)", + ), + ] = None, + follow: Annotated[ + bool, + typer.Option( + "--follow", + "-f", + help="Follow log output", + ), + ] = False, + tail: Annotated[ + int, + typer.Option( + "--tail", + help="Number of lines to show from the end of the logs", + ), + ] = 100, + previous: Annotated[ + bool, + typer.Option( + "--previous", + "-p", + help="Show logs from previous container instance", + ), + ] = False, +) -> None: + """View logs from Kubernetes pods. + + Shows logs from pods in the deployment. If no pod is specified, + shows logs from all pods with the app label. + + Examples: + uv run api-forge-cli k8s logs # All app pods + uv run api-forge-cli k8s logs api-forge-abc123 # Specific pod + uv run api-forge-cli k8s logs -f # Follow logs + uv run api-forge-cli k8s logs --previous # Previous container + """ + console.print(f"[dim]Namespace: {namespace}[/dim]\n") + + # Determine label selector for non-specific pod requests + label_selector = "app=api-forge" if not pod else None + + try: + result = run_sync( + _controller.get_pod_logs( + namespace=namespace, + pod=pod, + container=container, + label_selector=label_selector, + follow=follow, + tail=tail, + previous=previous, + ) + ) + if result.stdout: + console.print(result.stdout) + if not result.success and result.stderr: + console.print(f"[red]{result.stderr}[/red]") + except KeyboardInterrupt: + console.print("\n[dim]Log streaming stopped[/dim]") + + +@k8s_app.command(name="setup-tls") +@with_error_handling +def setup_tls( + email: Annotated[ + str | None, + typer.Option( + "--email", + "-e", + help="Email for Let's Encrypt certificate notifications (required)", + ), + ] = None, + staging: Annotated[ + bool, + typer.Option( + "--staging", + help="Use Let's Encrypt staging server (for testing)", + ), + ] = False, + install_cert_manager: Annotated[ + bool, + typer.Option( + "--install-cert-manager", + help="Automatically install cert-manager if not present", + ), + ] = True, +) -> None: + """Set up TLS with cert-manager and Let's Encrypt. + + This command: + 1. Checks if cert-manager is installed (installs via Helm if not) + 2. Creates a ClusterIssuer for Let's Encrypt + 3. Waits for the ClusterIssuer to be ready + + After setup, use --ingress-tls-auto with 'k8s up' for automatic certificates. 
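+    The ClusterIssuer manifest is also written to
+    infra/helm/cert-manager/<issuer-name>.yaml so it can be committed to
+    version control for GitOps workflows.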
+ + Examples: + uv run api-forge-cli k8s setup-tls --email admin@example.com + uv run api-forge-cli k8s setup-tls --email admin@example.com --staging + uv run api-forge-cli k8s up --ingress --ingress-host api.example.com --ingress-tls-auto + """ + print_header("TLS Setup with cert-manager") + + if not email: + console.print("[red]Email is required for Let's Encrypt registration.[/red]") + console.print("\n[dim]Example:[/dim]") + console.print( + " [cyan]uv run api-forge-cli k8s setup-tls --email admin@example.com[/cyan]" + ) + raise typer.Exit(1) + + # Step 1: Check/install cert-manager + console.print("\n[bold]Step 1/3:[/bold] Checking cert-manager installation...") + + if _check_cert_manager_installed(): + console.print("[green]✓[/green] cert-manager is already installed") + else: + if install_cert_manager: + console.print("[yellow]cert-manager not found, installing...[/yellow]") + if not _install_cert_manager(): + raise typer.Exit(1) + else: + console.print("[red]cert-manager is not installed.[/red]") + console.print( + "\n[dim]Run with --install-cert-manager or install manually:[/dim]" + ) + console.print( + " helm install cert-manager jetstack/cert-manager " + "--namespace cert-manager --create-namespace --set installCRDs=true" + ) + raise typer.Exit(1) + + # Step 2: Create ClusterIssuer + console.print("\n[bold]Step 2/3:[/bold] Creating ClusterIssuer...") + + if staging: + server = "https://acme-staging-v02.api.letsencrypt.org/directory" + issuer_name = "letsencrypt-staging" + console.print( + "[yellow]Using Let's Encrypt staging server (for testing)[/yellow]" + ) + else: + server = "https://acme-v02.api.letsencrypt.org/directory" + issuer_name = "letsencrypt-prod" + console.print("[cyan]Using Let's Encrypt production server[/cyan]") + + # Check if issuer already exists and is ready + if _check_cluster_issuer_ready(issuer_name): + console.print( + f"[green]✓[/green] ClusterIssuer '{issuer_name}' already exists and is ready" + ) + else: + # Create ClusterIssuer manifest file (version-controlled, GitOps-friendly) + project_root = get_project_root() + cert_manager_dir = project_root / "infra" / "helm" / "cert-manager" + cert_manager_dir.mkdir(parents=True, exist_ok=True) + + issuer_file = cert_manager_dir / f"{issuer_name}.yaml" + + cluster_issuer_yaml = f"""# ClusterIssuer for Let's Encrypt TLS certificates +# Generated by: uv run api-forge-cli k8s setup-tls --email {email} +# This is a cluster-scoped resource (not namespaced). 
+# Apply with: kubectl apply -f {issuer_file.relative_to(project_root)} +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: {issuer_name} + labels: + app.kubernetes.io/managed-by: api-forge-cli +spec: + acme: + # Let's Encrypt ACME server + server: {server} + # Email for certificate expiration notifications + email: {email} + # Secret to store the ACME account private key + privateKeySecretRef: + name: {issuer_name}-account-key + # HTTP-01 challenge solver using NGINX ingress + solvers: + - http01: + ingress: + class: nginx +""" + + # Write manifest to file + issuer_file.write_text(cluster_issuer_yaml) + console.print( + f"[dim]Wrote ClusterIssuer manifest to {issuer_file.relative_to(project_root)}[/dim]" + ) + + # Apply the manifest + console.print(f"[dim]Applying ClusterIssuer '{issuer_name}'...[/dim]") + + result = run_sync(_controller.apply_manifest(issuer_file)) + + if not result.success: + console.print("[red]Failed to create ClusterIssuer[/red]") + if result.stderr: + console.print(Panel(result.stderr, title="Error", border_style="red")) + raise typer.Exit(1) + + console.print(f"[green]✓[/green] ClusterIssuer '{issuer_name}' created") + + # Step 3: Wait for ClusterIssuer to be ready + console.print("\n[bold]Step 3/3:[/bold] Waiting for ClusterIssuer to be ready...") + + if _wait_for_cluster_issuer(issuer_name, timeout=60): + console.print(f"[green]✓[/green] ClusterIssuer '{issuer_name}' is ready") + else: + console.print( + f"[yellow]⚠ ClusterIssuer '{issuer_name}' created but not ready yet[/yellow]" + ) + console.print( + "[dim]This is normal - it will become ready when you create your first certificate.[/dim]" + ) + + # Success message with next steps + console.print("\n" + "=" * 60) + console.print("[bold green]✅ TLS setup complete![/bold green]") + console.print("=" * 60) + + console.print("\n[bold cyan]Deploy with automatic TLS:[/bold cyan]") + staging_flag = " --ingress-tls-staging" if staging else "" + console.print( + f" [cyan]uv run api-forge-cli k8s up --ingress --ingress-host api.example.com --ingress-tls-auto{staging_flag}[/cyan]" + ) + + console.print("\n[bold cyan]What happens next:[/bold cyan]") + console.print(" 1. Ingress is created with cert-manager annotation") + console.print(" 2. cert-manager detects the annotation and requests a certificate") + console.print(" 3. Let's Encrypt validates domain ownership via HTTP-01 challenge") + console.print(" 4. Certificate is stored in a Kubernetes secret") + console.print(" 5. NGINX Ingress serves HTTPS automatically") + console.print(" 6. cert-manager auto-renews before expiry") + + if staging: + console.print( + "\n[yellow]⚠ Staging certificates are not trusted by browsers.[/yellow]" + ) + console.print( + "[yellow] Run without --staging for production certificates.[/yellow]" + ) + + console.print("\n[bold cyan]Manifest saved to:[/bold cyan]") + console.print(f" [dim]infra/helm/cert-manager/{issuer_name}.yaml[/dim]") + console.print( + " [dim]Commit this file to version control for GitOps workflows.[/dim]" + ) diff --git a/src/cli/commands/prod.py b/src/cli/commands/prod.py new file mode 100644 index 0000000..0e0e2a5 --- /dev/null +++ b/src/cli/commands/prod.py @@ -0,0 +1,386 @@ +"""Production Docker Compose environment commands. + +This module provides commands for managing the production Docker Compose +environment: starting services, stopping them, and checking status. 
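+
+Commands:
+    up - Start the production environment
+    down - Stop the production environment
+    status - Show status of production services
+    logs - View logs from production services
+    restart - Restart one or all production services
+    build - Build production Docker images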
+""" + +from typing import TYPE_CHECKING, Annotated + +import typer + +from .shared import ( + confirm_action, + console, + get_project_root, + handle_error, + print_header, + with_error_handling, +) + +if TYPE_CHECKING: + from src.cli.deployment.prod_deployer import ProdDeployer + + +# --------------------------------------------------------------------------- +# Deployer Factory +# --------------------------------------------------------------------------- + + +def _get_deployer() -> "ProdDeployer": + """Get the production deployer instance. + + Returns: + ProdDeployer instance configured for current project + """ + from src.cli.deployment.prod_deployer import ProdDeployer + + return ProdDeployer(console, get_project_root()) + + +# --------------------------------------------------------------------------- +# Typer App +# --------------------------------------------------------------------------- + +prod_app = typer.Typer( + name="prod", + help="Production Docker Compose environment commands.", + no_args_is_help=True, +) + + +# --------------------------------------------------------------------------- +# Commands +# --------------------------------------------------------------------------- + + +@prod_app.command() +@with_error_handling +def up( + skip_build: Annotated[ + bool, + typer.Option( + "--skip-build", + help="Skip building the application image", + ), + ] = False, + no_wait: Annotated[ + bool, + typer.Option( + "--no-wait", + help="Don't wait for health checks to complete", + ), + ] = False, + force_recreate: Annotated[ + bool, + typer.Option( + "--force-recreate", + help="Force recreate containers (useful for secret rotation)", + ), + ] = False, +) -> None: + """Start the production Docker Compose environment. + + This command: + - Ensures required data directories exist + - Validates and cleans up stale bind-mount volumes + - Builds the application Docker image (unless --skip-build) + - Starts all production services with health checks + - Monitors service health (unless --no-wait) + + Examples: + uv run api-forge-cli prod up + uv run api-forge-cli prod up --skip-build --no-wait + uv run api-forge-cli prod up --force-recreate # For secret rotation + """ + print_header("Starting Production Environment") + + deployer = _get_deployer() + deployer.deploy( + skip_build=skip_build, + no_wait=no_wait, + force_recreate=force_recreate, + ) + + +@prod_app.command() +@with_error_handling +def down( + volumes: Annotated[ + bool, + typer.Option( + "--volumes", + "-v", + help="Also remove data volumes and directories (DESTRUCTIVE)", + ), + ] = False, + yes: Annotated[ + bool, + typer.Option( + "--yes", + "-y", + help="Skip confirmation prompt for destructive operations", + ), + ] = False, +) -> None: + """Stop the production Docker Compose environment. + + By default, this preserves all data volumes so you can restart later + without losing data. Use --volumes to also remove data (requires confirmation). 
+ + Examples: + uv run api-forge-cli prod down + uv run api-forge-cli prod down --volumes # Remove data too + uv run api-forge-cli prod down -v -y # Remove data without prompt + """ + print_header("Stopping Production Environment") + + if volumes and not yes: + if not confirm_action( + "Remove data volumes", + "This will permanently delete all production data including:\n" + " â€ĸ PostgreSQL database\n" + " â€ĸ Redis cache and sessions\n" + " â€ĸ Application logs\n" + " â€ĸ SSL certificates", + ): + console.print("[dim]Operation cancelled[/dim]") + raise typer.Exit(0) + + deployer = _get_deployer() + deployer.teardown(volumes=volumes) + + +@prod_app.command() +@with_error_handling +def status() -> None: + """Show the status of production services. + + Displays the health and configuration of each production service. + + Examples: + uv run api-forge-cli prod status + """ + print_header("Production Environment Status") + + deployer = _get_deployer() + deployer.show_status() + + +@prod_app.command() +@with_error_handling +def logs( + service: Annotated[ + str | None, + typer.Argument( + help="Service name to view logs for (e.g., app, postgres, redis, temporal)", + ), + ] = None, + follow: Annotated[ + bool, + typer.Option( + "--follow", + "-f", + help="Follow log output", + ), + ] = False, + tail: Annotated[ + int, + typer.Option( + "--tail", + "-n", + help="Number of lines to show from the end of the logs", + ), + ] = 100, +) -> None: + """View logs from production services. + + Shows logs from the production Docker Compose environment. Optionally + specify a service name to filter logs. + + Examples: + uv run api-forge-cli prod logs # All services + uv run api-forge-cli prod logs app # Just the app service + uv run api-forge-cli prod logs app -f # Follow app logs + uv run api-forge-cli prod logs -n 50 # Last 50 lines + """ + import subprocess + + project_root = get_project_root() + compose_file = project_root / "docker-compose.prod.yml" + + if not compose_file.exists(): + handle_error(f"Compose file not found: {compose_file}") + raise typer.Exit(1) + + cmd = [ + "docker", + "compose", + "-p", + "api-forge-prod", + "-f", + str(compose_file), + "logs", + f"--tail={tail}", + ] + + if follow: + cmd.append("--follow") + + if service: + cmd.append(service) + console.print(f"[dim]Showing logs for service: {service}[/dim]\n") + else: + console.print("[dim]Showing logs for all production services[/dim]\n") + + try: + subprocess.run(cmd, check=True) + except subprocess.CalledProcessError as e: + handle_error(f"Failed to retrieve logs: {e}") + raise typer.Exit(1) from e + except KeyboardInterrupt: + console.print("\n[dim]Log streaming stopped[/dim]") + + +@prod_app.command() +@with_error_handling +def restart( + service: Annotated[ + str | None, + typer.Argument( + help="Service name to restart (restarts all if not specified)", + ), + ] = None, + force_recreate: Annotated[ + bool, + typer.Option( + "--force-recreate", + help="Force recreate containers", + ), + ] = False, +) -> None: + """Restart production services. + + Restarts one or all production services. Useful for picking up + configuration changes. 
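+    Use --force-recreate to recreate containers instead of restarting them
+    in place, for example after rotating secrets.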
+ + Examples: + uv run api-forge-cli prod restart # Restart all + uv run api-forge-cli prod restart app # Just restart app + uv run api-forge-cli prod restart --force-recreate + """ + import subprocess + + project_root = get_project_root() + compose_file = project_root / "docker-compose.prod.yml" + + if not compose_file.exists(): + handle_error(f"Compose file not found: {compose_file}") + raise typer.Exit(1) + + if service: + console.print(f"[cyan]Restarting service: {service}[/cyan]") + cmd = [ + "docker", + "compose", + "-p", + "api-forge-prod", + "-f", + str(compose_file), + "restart", + service, + ] + elif force_recreate: + # Full restart with force-recreate + console.print("[cyan]Force restarting all production services...[/cyan]") + cmd = [ + "docker", + "compose", + "-p", + "api-forge-prod", + "-f", + str(compose_file), + "up", + "-d", + "--force-recreate", + ] + else: + console.print("[cyan]Restarting all production services...[/cyan]") + cmd = [ + "docker", + "compose", + "-p", + "api-forge-prod", + "-f", + str(compose_file), + "restart", + ] + + try: + subprocess.run(cmd, check=True) + console.print("[green]✓[/green] Restart complete") + except subprocess.CalledProcessError as e: + handle_error(f"Failed to restart services: {e}") + raise typer.Exit(1) from e + + +@prod_app.command() +@with_error_handling +def build( + service: Annotated[ + str | None, + typer.Argument( + help="Service name to build (builds all if not specified)", + ), + ] = None, + no_cache: Annotated[ + bool, + typer.Option( + "--no-cache", + help="Build without using cache", + ), + ] = False, +) -> None: + """Build production Docker images. + + Builds one or all production service images. Useful for rebuilding + after Dockerfile changes. + + Examples: + uv run api-forge-cli prod build # Build all + uv run api-forge-cli prod build app # Just build app + uv run api-forge-cli prod build --no-cache + """ + import subprocess + + project_root = get_project_root() + compose_file = project_root / "docker-compose.prod.yml" + + if not compose_file.exists(): + handle_error(f"Compose file not found: {compose_file}") + raise typer.Exit(1) + + cmd = [ + "docker", + "compose", + "-p", + "api-forge-prod", + "-f", + str(compose_file), + "build", + ] + + if no_cache: + cmd.append("--no-cache") + + if service: + cmd.append(service) + console.print(f"[cyan]Building service: {service}[/cyan]") + else: + console.print("[cyan]Building all production images...[/cyan]") + + try: + subprocess.run(cmd, check=True) + console.print("[green]✓[/green] Build complete") + except subprocess.CalledProcessError as e: + handle_error(f"Build failed: {e}") + raise typer.Exit(1) from e diff --git a/src/cli/secrets_commands.py b/src/cli/commands/secrets.py similarity index 99% rename from src/cli/secrets_commands.py rename to src/cli/commands/secrets.py index 34064f6..a4b8849 100644 --- a/src/cli/secrets_commands.py +++ b/src/cli/commands/secrets.py @@ -7,7 +7,7 @@ from rich.panel import Panel from rich.table import Table -from .utils import confirm_destructive_action, console, get_project_root +from .shared import confirm_destructive_action, console, get_project_root # Create the secrets command group secrets_app = typer.Typer(help="🔐 Secrets management commands") diff --git a/src/cli/commands/shared.py b/src/cli/commands/shared.py new file mode 100644 index 0000000..ae2a862 --- /dev/null +++ b/src/cli/commands/shared.py @@ -0,0 +1,143 @@ +"""Shared utilities for CLI commands. 
+ +This module provides common utilities used across all command modules, +including console output, confirmation dialogs, and path resolution. +""" + +from collections.abc import Callable +from pathlib import Path + +import typer +from rich.console import Console +from rich.panel import Panel + +# Shared console instance for consistent output +console = Console() + + +def get_project_root() -> Path: + """Get the project root directory. + + Walks up from the module location to find the project root, + identified by the presence of pyproject.toml. + + Returns: + Path to the project root directory + """ + current = Path(__file__).resolve() + + # Walk up the directory tree looking for pyproject.toml + for parent in [current, *current.parents]: + if (parent / "pyproject.toml").exists(): + return parent + + # Fallback to four levels up (src/cli/commands/shared.py -> project root) + return Path(__file__).parent.parent.parent.parent + + +def confirm_action( + action: str, + details: str | None = None, + extra_warning: str | None = None, + force: bool = False, +) -> bool: + """Prompt user to confirm a potentially destructive action. + + Args: + action: Description of the action (e.g., "Stop all services") + details: Additional details about what will be affected + extra_warning: Extra warning message (e.g., for data loss) + force: If True, skip the confirmation prompt + + Returns: + True if the user confirmed, False otherwise + """ + if force: + return True + + # Build warning message + warning_lines = [f"[bold red]âš ī¸ {action}[/bold red]"] + + if details: + warning_lines.append(f"\n{details}") + + if extra_warning: + warning_lines.append(f"\n[yellow]{extra_warning}[/yellow]") + + console.print( + Panel( + "\n".join(warning_lines), + title="Confirmation Required", + border_style="red", + ) + ) + + try: + response = console.input( + "\n[bold]Are you sure you want to proceed?[/bold] \\[y/N]: " + ) + return response.strip().lower() in ("y", "yes") + except (KeyboardInterrupt, EOFError): + console.print("\n[dim]Cancelled.[/dim]") + return False + + +# Alias for backward compatibility +confirm_destructive_action = confirm_action + + +def handle_error(message: str, details: str | None = None, exit_code: int = 1) -> None: + """Handle an error by printing a message and exiting. + + Args: + message: Error message to display + details: Optional additional details + exit_code: Exit code to use + """ + console.print(f"\n[bold red]❌ {message}[/bold red]\n") + if details: + console.print(Panel(details, title="Details", border_style="red")) + raise typer.Exit(exit_code) + + +def print_header(title: str, style: str = "blue") -> None: + """Print a styled header panel. + + Args: + title: Header title text + style: Border style color + """ + console.print( + Panel.fit( + f"[bold {style}]{title}[/bold {style}]", + border_style=style, + ) + ) + + +def with_error_handling(func: Callable[..., None]) -> Callable[..., None]: + """Decorator to wrap command functions with standard error handling. + + Catches common exceptions and formats them consistently. 
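+    DeploymentError is reported via handle_error; KeyboardInterrupt exits
+    with status 130.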
+ + Args: + func: The command function to wrap + + Returns: + Wrapped function with error handling + """ + from functools import wraps + + from src.cli.deployment.helm_deployer.image_builder import DeploymentError + + @wraps(func) + def wrapper(*args: object, **kwargs: object) -> None: + try: + func(*args, **kwargs) + except DeploymentError as e: + handle_error(e.message, e.details) + except KeyboardInterrupt: + console.print("\n[dim]Operation cancelled by user.[/dim]") + raise typer.Exit(130) from None + + return wrapper diff --git a/src/cli/user_commands.py b/src/cli/commands/users.py similarity index 99% rename from src/cli/user_commands.py rename to src/cli/commands/users.py index 640be34..a60e39a 100644 --- a/src/cli/user_commands.py +++ b/src/cli/commands/users.py @@ -1,13 +1,12 @@ """Keycloak user management CLI commands.""" import typer -from rich.console import Console from rich.prompt import Confirm from rich.table import Table from src.dev.keycloak_client import KeycloakClient -console = Console() +from .shared import console # Create the users subcommand app users_app = typer.Typer(help="Manage Keycloak users in development environment") diff --git a/src/cli/deploy_commands.py b/src/cli/deploy_commands.py deleted file mode 100644 index 2a76d83..0000000 --- a/src/cli/deploy_commands.py +++ /dev/null @@ -1,628 +0,0 @@ -"""Deployment CLI commands for dev, prod, and k8s environments.""" - -import subprocess -import sys -from enum import Enum -from pathlib import Path - -import typer -from rich.panel import Panel - -from .deployment import DevDeployer, HelmDeployer, ProdDeployer -from .deployment.helm_deployer.image_builder import DeploymentError -from .utils import confirm_destructive_action, console, get_project_root - -# Create the deploy command group -deploy_app = typer.Typer(help="🚀 Deployment commands for different environments") - - -class Environment(str, Enum): - """Deployment environment options.""" - - DEV = "dev" - PROD = "prod" - K8S = "k8s" - - -@deploy_app.command() -def up( - env: Environment = typer.Argument( - ..., help="Environment to deploy (dev, prod, or k8s)" - ), - force: bool = typer.Option( - False, "--force", help="Force restart even if services are running (dev only)" - ), - no_wait: bool = typer.Option( - False, "--no-wait", help="Don't wait for services to be ready" - ), - start_server: bool = typer.Option( - True, - "--start-server/--no-start-server", - help="Start FastAPI dev server after deploying services (dev only)", - ), - skip_build: bool = typer.Option( - False, "--skip-build", help="Skip building the app image (prod only)" - ), - force_recreate: bool = typer.Option( - False, - "--force-recreate", - help="Force recreate containers to pick up new secrets (prod/k8s only)", - ), - namespace: str = typer.Option( - "api-forge-prod", "--namespace", "-n", help="Kubernetes namespace (k8s only)" - ), - registry: str = typer.Option( - None, - "--registry", - "-r", - help="Container registry for remote k8s clusters (e.g., ghcr.io/myuser)", - ), - ingress: bool = typer.Option( - False, - "--ingress/--no-ingress", - help="Enable Ingress for external access (k8s only)", - ), - ingress_host: str = typer.Option( - None, - "--ingress-host", - help="Ingress hostname (k8s only, e.g., api.example.com)", - ), - ingress_tls_secret: str = typer.Option( - None, - "--ingress-tls-secret", - help="TLS secret name for HTTPS (k8s only)", - ), -) -> None: - """ - 🚀 Deploy the application to the specified environment. 
- - Environments: - - dev: Development environment with hot reload - - prod: Production-like Docker Compose environment - - k8s: Kubernetes cluster deployment - - For k8s deployments, the cluster type is auto-detected: - - Minikube/Kind: Images loaded directly into cluster cache - - Remote clusters: Use --registry to push images to a container registry - """ - project_root = Path(get_project_root()) - - # Display header - env_name = env.value.upper() - console.print( - Panel.fit( - f"[bold blue]Deploying {env_name} Environment[/bold blue]", - border_style="blue", - ) - ) - - # Create appropriate deployer and execute deployment - try: - deployer: DevDeployer | ProdDeployer | HelmDeployer - if env == Environment.DEV: - deployer = DevDeployer(console, project_root) - deployer.deploy(force=force, no_wait=no_wait, start_server=start_server) - - elif env == Environment.PROD: - deployer = ProdDeployer(console, project_root) - deployer.deploy( - skip_build=skip_build, no_wait=no_wait, force_recreate=force_recreate - ) - - elif env == Environment.K8S: - deployer = HelmDeployer(console, project_root) - deployer.deploy( - namespace=namespace, - no_wait=no_wait, - force_recreate=force_recreate, - registry=registry, - ingress_enabled=ingress, - ingress_host=ingress_host, - ingress_tls_secret=ingress_tls_secret, - ) - - except DeploymentError as e: - console.print(f"\n[bold red]❌ Deployment failed: {e.message}[/bold red]\n") - if e.details: - console.print(Panel(e.details, title="Details", border_style="red")) - sys.exit(1) - - -@deploy_app.command() -def down( - env: Environment = typer.Argument( - ..., help="Environment to stop (dev, prod, or k8s)" - ), - namespace: str = typer.Option( - "api-forge-prod", "--namespace", "-n", help="Kubernetes namespace (k8s only)" - ), - volumes: bool = typer.Option( - False, "--volumes", "-v", help="Remove volumes/PVCs along with deployment" - ), - yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation prompt"), -) -> None: - """ - âšī¸ Stop services in the specified environment. - - Environments: - - dev: Stop development Docker Compose services - - prod: Stop production Docker Compose services and optionally volumes - - k8s: Delete Kubernetes deployment and optionally PVCs - """ - project_root = Path(get_project_root()) - env_name = env.value.upper() - - # Build confirmation details - if env == Environment.K8S: - details = f"This will stop all services in namespace '{namespace}'." - else: - details = f"This will stop all {env_name} Docker Compose services." - - extra_warning = None - if volumes: - extra_warning = ( - "âš ī¸ --volumes flag is set: ALL DATA WILL BE PERMANENTLY DELETED!\n" - " This includes databases, caches, and any persistent storage." 
- ) - - # Confirm destructive action - if not confirm_destructive_action( - action=f"Stop {env_name} environment", - details=details, - extra_warning=extra_warning, - force=yes, - ): - console.print("[dim]Operation cancelled.[/dim]") - raise typer.Exit(0) - - # Display header - console.print( - Panel.fit( - f"[bold red]Stopping {env_name} Environment[/bold red]", - border_style="red", - ) - ) - - # Create appropriate deployer and execute teardown - try: - deployer: DevDeployer | ProdDeployer | HelmDeployer - if env == Environment.DEV: - deployer = DevDeployer(console, project_root) - deployer.teardown(volumes=volumes) - - elif env == Environment.PROD: - deployer = ProdDeployer(console, project_root) - deployer.teardown(volumes=volumes) - - elif env == Environment.K8S: - deployer = HelmDeployer(console, project_root) - deployer.teardown(namespace=namespace, volumes=volumes) - - except DeploymentError as e: - console.print(f"\n[bold red]❌ Teardown failed: {e.message}[/bold red]\n") - if e.details: - console.print(Panel(e.details, title="Details", border_style="red")) - sys.exit(1) - - -@deploy_app.command() -def status( - env: Environment = typer.Argument( - ..., help="Environment to check status (dev, prod, or k8s)" - ), - namespace: str = typer.Option( - "api-forge-prod", "--namespace", "-n", help="Kubernetes namespace (k8s only)" - ), -) -> None: - """ - 📊 Show status of services in the specified environment. - - Environments: - - dev: Show development Docker Compose services status - - prod: Show production Docker Compose services status - - k8s: Show Kubernetes deployment status - """ - project_root = Path(get_project_root()) - - # Create appropriate deployer and show status - deployer: DevDeployer | ProdDeployer | HelmDeployer - if env == Environment.DEV: - deployer = DevDeployer(console, project_root) - deployer.show_status() - - elif env == Environment.PROD: - deployer = ProdDeployer(console, project_root) - deployer.show_status() - - elif env == Environment.K8S: - deployer = HelmDeployer(console, project_root) - deployer.show_status(namespace) - - -@deploy_app.command() -def rotate( - env: Environment = typer.Argument( - ..., help="Environment to rotate secrets for (prod or k8s)" - ), - redeploy: bool = typer.Option( - True, "--redeploy/--no-redeploy", help="Automatically redeploy after rotation" - ), - force: bool = typer.Option( - True, "--force/--no-force", help="Force overwrite existing secrets" - ), - backup: bool = typer.Option( - True, "--backup/--no-backup", help="Backup existing secrets before rotation" - ), - namespace: str = typer.Option( - "api-forge-prod", "--namespace", "-n", help="Kubernetes namespace (k8s only)" - ), - yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation prompt"), -) -> None: - """ - 🔐 Rotate secrets and optionally redeploy. - - This command: - 1. Generates new cryptographically secure secrets - 2. Optionally backs up existing secrets - 3. 
Optionally redeploys the environment to pick up new secrets - - Environments: - - prod: Rotate Docker Compose production secrets - - k8s: Rotate Kubernetes secrets - - Example usage: - # Rotate and redeploy prod (default behavior) - uv run api-forge-cli deploy rotate prod - - # Rotate without redeploying - uv run api-forge-cli deploy rotate prod --no-redeploy - - # Rotate k8s secrets with backup - uv run api-forge-cli deploy rotate k8s --backup - """ - project_root = Path(get_project_root()) - secrets_script = project_root / "infra" / "secrets" / "generate_secrets.sh" - - if not secrets_script.exists(): - console.print( - f"[red]✗[/red] Secret generation script not found at {secrets_script}" - ) - raise typer.Exit(1) - - if env == Environment.DEV: - console.print( - "[yellow]⚠[/yellow] Secret rotation is not needed for dev environment" - ) - console.print(" Dev environment uses hardcoded test credentials") - raise typer.Exit(0) - - # Confirm destructive action - env_name = env.value.upper() - details = ( - "This will regenerate all production secrets including:\n" - " â€ĸ Database passwords\n" - " â€ĸ Session signing secrets\n" - " â€ĸ CSRF signing secrets\n" - " â€ĸ OIDC client secrets" - ) - extra_warning = ( - "âš ī¸ Existing secrets will be overwritten!\n" - " Running services will need to be restarted to use new secrets." - ) - if not backup: - extra_warning += "\n --no-backup: Old secrets will NOT be backed up!" - - if not confirm_destructive_action( - action=f"Rotate {env_name} secrets", - details=details, - extra_warning=extra_warning, - force=yes, - ): - console.print("[dim]Operation cancelled.[/dim]") - raise typer.Exit(0) - - # Display header - env_name = env.value.upper() - console.print( - Panel.fit( - f"[bold yellow]🔐 Rotating {env_name} Secrets[/bold yellow]", - border_style="yellow", - ) - ) - - # Step 1: Backup existing secrets (if requested) - if backup: - console.print("\n[bold]Step 1/3:[/bold] Backing up existing secrets...") - backup_cmd = [str(secrets_script), "--backup-only"] - try: - result = subprocess.run( - backup_cmd, - cwd=secrets_script.parent, - capture_output=True, - text=True, - check=True, - ) - console.print("[green]✓[/green] Backup complete") - if result.stdout: - console.print(result.stdout) - except subprocess.CalledProcessError as e: - console.print( - f"[yellow]⚠[/yellow] Backup failed (continuing anyway): {e.stderr}" - ) - - # Step 2: Generate new secrets - console.print( - f"\n[bold]Step {'2/3' if backup else '1/2'}:[/bold] Generating new secrets..." - ) - generate_cmd = [str(secrets_script)] - if force: - generate_cmd.append("--force") - - try: - subprocess.run( - generate_cmd, - cwd=secrets_script.parent, - capture_output=False, # Show output in real-time - text=True, - check=True, - ) - console.print("[green]✓[/green] New secrets generated") - except subprocess.CalledProcessError as e: - console.print(f"[red]✗[/red] Secret generation failed: {e}") - raise typer.Exit(1) from e - - # Step 3: Redeploy (if requested) - if redeploy: - console.print( - f"\n[bold]Step {'3/3' if backup else '2/2'}:[/bold] Redeploying with new secrets..." 
- ) - - deployer: DevDeployer | ProdDeployer | HelmDeployer - if env == Environment.PROD: - deployer = ProdDeployer(console, project_root) - deployer.deploy(skip_build=False, no_wait=False, force_recreate=True) - - elif env == Environment.K8S: - deployer = HelmDeployer(console, project_root) - deployer.deploy(namespace=namespace, no_wait=False, force_recreate=True) - - console.print( - "\n[bold green]🎉 Secret rotation and redeployment complete![/bold green]" - ) - else: - console.print( - "\n[bold yellow]⚠[/bold yellow] Secrets rotated but not deployed." - ) - console.print(" Run the following command to deploy with new secrets:") - if env == Environment.PROD: - console.print( - " [cyan]uv run api-forge-cli deploy up prod --force-recreate[/cyan]" - ) - elif env == Environment.K8S: - console.print( - f" [cyan]uv run api-forge-cli deploy up k8s --force-recreate -n {namespace}[/cyan]" - ) - - -@deploy_app.command() -def rollback( - revision: int = typer.Argument( - None, help="Revision number to rollback to (default: previous revision)" - ), - namespace: str = typer.Option( - "api-forge-prod", "--namespace", "-n", help="Kubernetes namespace" - ), - yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation prompt"), -) -> None: - """ - âĒ Rollback Kubernetes deployment to a previous revision. - - This command uses Helm's native rollback functionality to restore - the deployment to a previous working state. - - Examples: - # Rollback to the previous revision - uv run api-forge-cli deploy rollback - - # Rollback to a specific revision - uv run api-forge-cli deploy rollback 3 - - # View revision history first - uv run api-forge-cli deploy history - """ - from rich.table import Table - - from .deployment import HelmDeployer - - project_root = Path(get_project_root()) - deployer = HelmDeployer(console, project_root) - - # Get release history - history = deployer.commands.helm.history( - deployer.constants.HELM_RELEASE_NAME, namespace - ) - - if not history: - console.print( - f"[red]No release history found for '{deployer.constants.HELM_RELEASE_NAME}' " - f"in namespace '{namespace}'[/red]" - ) - console.print("\n[dim]Make sure the release exists and you have access.[/dim]") - raise typer.Exit(1) - - # Show current state - current = history[-1] - current_revision = int(current.get("revision", 0)) - - if current_revision <= 1: - console.print( - "[yellow]⚠ Only one revision exists. Nothing to rollback to.[/yellow]" - ) - raise typer.Exit(0) - - # Determine target revision - target_revision = revision if revision is not None else current_revision - 1 - - if target_revision < 1 or target_revision >= current_revision: - console.print( - f"[red]Invalid revision {target_revision}. 
" - f"Must be between 1 and {current_revision - 1}.[/red]" - ) - raise typer.Exit(1) - - # Find target revision info - target_info = next( - (h for h in history if int(h.get("revision", 0)) == target_revision), None - ) - - # Show rollback plan - console.print("\n[bold cyan]📋 Rollback Plan[/bold cyan]\n") - - table = Table(show_header=True, header_style="bold") - table.add_column("", style="dim") - table.add_column("Revision") - table.add_column("Status") - table.add_column("Description") - - table.add_row( - "Current", - str(current_revision), - current.get("status", "unknown"), - current.get("description", "")[:50], - ) - - if target_info: - table.add_row( - "Target", - str(target_revision), - target_info.get("status", "unknown"), - target_info.get("description", "")[:50], - ) - - console.print(table) - - # Confirm - if not confirm_destructive_action( - action=f"Rollback to revision {target_revision}", - details=f"This will restore the deployment in namespace '{namespace}' to revision {target_revision}.", - extra_warning="Active pods will be replaced with the previous configuration.", - force=yes, - ): - console.print("[dim]Rollback cancelled.[/dim]") - raise typer.Exit(0) - - # Perform rollback - console.print( - Panel.fit( - f"[bold yellow]âĒ Rolling back to revision {target_revision}[/bold yellow]", - border_style="yellow", - ) - ) - - result = deployer.commands.helm.rollback( - deployer.constants.HELM_RELEASE_NAME, - namespace, - target_revision, - wait=True, - timeout="5m", - ) - - if result.success: - console.print( - f"\n[bold green]✅ Successfully rolled back to revision {target_revision}![/bold green]" - ) - console.print( - "\n[dim]Run 'uv run api-forge-cli deploy status k8s' to verify.[/dim]" - ) - else: - console.print("\n[bold red]❌ Rollback failed[/bold red]") - if result.stderr: - console.print(Panel(result.stderr, title="Error", border_style="red")) - raise typer.Exit(1) - - -@deploy_app.command() -def history( - namespace: str = typer.Option( - "api-forge-prod", "--namespace", "-n", help="Kubernetes namespace" - ), - max_revisions: int = typer.Option( - 10, "--max", "-m", help="Maximum number of revisions to show" - ), -) -> None: - """ - 📜 Show Kubernetes deployment revision history. - - Displays the Helm release history including revision numbers, - timestamps, status, and descriptions. Use this to identify - which revision to rollback to. 
- - Examples: - # Show last 10 revisions - uv run api-forge-cli deploy history - - # Show last 5 revisions - uv run api-forge-cli deploy history --max 5 - """ - from rich.table import Table - - from .deployment import HelmDeployer - - project_root = Path(get_project_root()) - deployer = HelmDeployer(console, project_root) - - # Get release history - history_data = deployer.commands.helm.history( - deployer.constants.HELM_RELEASE_NAME, namespace, max_revisions - ) - - if not history_data: - console.print( - f"[yellow]No release history found for '{deployer.constants.HELM_RELEASE_NAME}' " - f"in namespace '{namespace}'[/yellow]" - ) - console.print( - "\n[dim]Deploy first with: uv run api-forge-cli deploy up k8s[/dim]" - ) - return - - console.print( - Panel.fit( - f"[bold cyan]📜 Release History: {deployer.constants.HELM_RELEASE_NAME}[/bold cyan]", - border_style="cyan", - ) - ) - - table = Table(show_header=True, header_style="bold") - table.add_column("Revision", justify="right") - table.add_column("Updated") - table.add_column("Status") - table.add_column("Chart") - table.add_column("Description") - - for entry in history_data: - revision = entry.get("revision", "") - updated = entry.get("updated", "")[:19] # Trim timezone - status = entry.get("status", "") - chart = entry.get("chart", "") - description = entry.get("description", "")[:40] - - # Color status - if status == "deployed": - status_display = f"[green]{status}[/green]" - elif status in ("failed", "superseded"): - status_display = f"[red]{status}[/red]" - elif status == "pending-upgrade": - status_display = f"[yellow]{status}[/yellow]" - else: - status_display = status - - table.add_row(str(revision), updated, status_display, chart, description) - - console.print(table) - - # Show rollback hint - if len(history_data) > 1: - console.print( - "\n[dim]To rollback: uv run api-forge-cli deploy rollback [/dim]" - ) diff --git a/src/cli/deployment/helm_deployer/cleanup.py b/src/cli/deployment/helm_deployer/cleanup.py index 098d405..f4983f9 100644 --- a/src/cli/deployment/helm_deployer/cleanup.py +++ b/src/cli/deployment/helm_deployer/cleanup.py @@ -8,7 +8,9 @@ from typing import TYPE_CHECKING -from ..shell_commands import ReplicaSetInfo, calculate_replicaset_age_hours +from src.infra.k8s.controller import ReplicaSetInfo + +from ..shell_commands import calculate_replicaset_age_hours from .constants import DeploymentConstants if TYPE_CHECKING: diff --git a/src/cli/deployment/helm_deployer/deployer.py b/src/cli/deployment/helm_deployer/deployer.py index c35f574..39da86a 100644 --- a/src/cli/deployment/helm_deployer/deployer.py +++ b/src/cli/deployment/helm_deployer/deployer.py @@ -167,6 +167,8 @@ def deploy( ingress_enabled: bool = False, ingress_host: str | None = None, ingress_tls_secret: str | None = None, + ingress_tls_auto: bool = False, + ingress_tls_staging: bool = False, **kwargs: Any, ) -> None: """Deploy to Kubernetes cluster. 
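The `deploy()` signature above now accepts `ingress_tls_auto` and `ingress_tls_staging`. How those flags are surfaced on the command line is not part of this hunk; a hedged sketch of what the corresponding `k8s up` options might look like (the option names are assumptions), wiring straight through to `HelmDeployer.deploy()`:

```python
from pathlib import Path

import typer

from src.cli.deployment import HelmDeployer

from .shared import console, get_project_root

app = typer.Typer(name="k8s", no_args_is_help=True)


@app.command()
def up(
    namespace: str = typer.Option("api-forge-prod", "--namespace", "-n"),
    ingress_host: str = typer.Option(None, "--ingress-host"),
    tls_auto: bool = typer.Option(
        False, "--tls-auto", help="Auto-provision TLS via cert-manager"
    ),
    tls_staging: bool = typer.Option(
        False, "--tls-staging", help="Use the Let's Encrypt staging issuer"
    ),
) -> None:
    """Deploy to Kubernetes, optionally with automatic TLS (illustrative)."""
    deployer = HelmDeployer(console, Path(get_project_root()))
    deployer.deploy(
        namespace=namespace,
        ingress_enabled=ingress_host is not None,
        ingress_host=ingress_host,
        ingress_tls_auto=tls_auto,
        ingress_tls_staging=tls_staging,
    )
```

When `--tls-auto` is set, the hunks that follow show the deployer annotating the Ingress with the matching `letsencrypt-staging` or `letsencrypt-prod` ClusterIssuer and deriving the TLS secret name from the hostname.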
@@ -187,7 +189,9 @@ def deploy( registry: Container registry for remote clusters ingress_enabled: Whether to enable Ingress for external access ingress_host: Hostname for Ingress (e.g., api.example.com) - ingress_tls_secret: TLS secret name for HTTPS + ingress_tls_secret: TLS secret name for HTTPS (manual) + ingress_tls_auto: Auto-provision TLS via cert-manager + ingress_tls_staging: Use staging Let's Encrypt (with ingress_tls_auto) **kwargs: Reserved for future options """ if not self.check_env_file(): @@ -255,6 +259,8 @@ def deploy( ingress_enabled=ingress_enabled, ingress_host=ingress_host, ingress_tls_secret=ingress_tls_secret, + ingress_tls_auto=ingress_tls_auto, + ingress_tls_staging=ingress_tls_staging, ) self.helm_release.deploy_release(namespace, image_override_file) diff --git a/src/cli/deployment/helm_deployer/helm_release.py b/src/cli/deployment/helm_deployer/helm_release.py index 5a8e9b3..e99f3e3 100644 --- a/src/cli/deployment/helm_deployer/helm_release.py +++ b/src/cli/deployment/helm_deployer/helm_release.py @@ -58,6 +58,8 @@ def create_image_override_file( ingress_enabled: bool = False, ingress_host: str | None = None, ingress_tls_secret: str | None = None, + ingress_tls_auto: bool = False, + ingress_tls_staging: bool = False, ) -> Path: """Create a temporary values file to override image tags and ingress. @@ -66,7 +68,9 @@ def create_image_override_file( registry: Optional container registry prefix for remote clusters ingress_enabled: Whether to enable Ingress for external access ingress_host: Hostname for Ingress (e.g., api.example.com) - ingress_tls_secret: TLS secret name for HTTPS + ingress_tls_secret: TLS secret name for HTTPS (manual certificate) + ingress_tls_auto: Auto-provision TLS via cert-manager + ingress_tls_staging: Use staging Let's Encrypt (with ingress_tls_auto) Returns: Path to the temporary override file @@ -102,17 +106,34 @@ def create_image_override_file( {"host": host, "paths": [{"path": "/", "pathType": "Prefix"}]} ] - # Add TLS configuration if secret is provided - if ingress_tls_secret: + tls_info = "" + + # Handle automatic TLS via cert-manager + if ingress_tls_auto: + issuer_name = ( + "letsencrypt-staging" if ingress_tls_staging else "letsencrypt-prod" + ) + # Add cert-manager annotation + ingress_config["annotations"] = { + "cert-manager.io/cluster-issuer": issuer_name + } + # Generate secret name from hostname (sanitize for K8s naming) + auto_secret_name = host.replace(".", "-") + "-tls" + ingress_config["tls"] = [ + {"secretName": auto_secret_name, "hosts": [host]} + ] + tls_info = f" (TLS: auto via {issuer_name})" + # Add TLS configuration if manual secret is provided + elif ingress_tls_secret: ingress_config["tls"] = [ {"secretName": ingress_tls_secret, "hosts": [host]} ] + tls_info = f" (TLS: {ingress_tls_secret})" override_values["app"]["ingress"] = ingress_config self.console.print( - f"[bold cyan]🌐 Ingress enabled:[/bold cyan] {host}" - + (f" (TLS: {ingress_tls_secret})" if ingress_tls_secret else "") + f"[bold cyan]🌐 Ingress enabled:[/bold cyan] {host}{tls_info}" ) temp_file = Path(tempfile.mktemp(suffix=".yaml", prefix="helm-image-override-")) diff --git a/src/cli/deployment/helm_deployer/validator.py b/src/cli/deployment/helm_deployer/validator.py index d9040f8..d46c55e 100644 --- a/src/cli/deployment/helm_deployer/validator.py +++ b/src/cli/deployment/helm_deployer/validator.py @@ -16,6 +16,8 @@ from enum import Enum from typing import TYPE_CHECKING +from src.infra.k8s.controller import PodInfo + if TYPE_CHECKING: from rich.console 
import Console @@ -302,8 +304,8 @@ def _check_failed_jobs(self, namespace: str, result: ValidationResult) -> None: jobs = self.commands.kubectl.get_jobs(namespace) for job in jobs: - job_name = job["name"] - job_status = job.get("status") + job_name = job.name + job_status = job.status # If job succeeded, it's fine - ignore any previous failures if job_status == "Complete": @@ -337,9 +339,9 @@ def _check_crashloop_pods(self, namespace: str, result: ValidationResult) -> Non pods = self.commands.kubectl.get_pods(namespace) for pod in pods: - if pod.get("status") == "CrashLoopBackOff": - pod_name = str(pod["name"]) - restarts = pod.get("restarts", 0) + if pod.status == "CrashLoopBackOff": + pod_name = pod.name + restarts = pod.restarts result.issues.append( ValidationIssue( severity=ValidationSeverity.ERROR, @@ -362,8 +364,8 @@ def _check_pending_pods(self, namespace: str, result: ValidationResult) -> None: pods = self.commands.kubectl.get_pods(namespace) for pod in pods: - if pod.get("status") == "Pending": - pod_name = str(pod["name"]) + if pod.status == "Pending": + pod_name = pod.name # Check if it's been pending for a while (ignore recently created) # For now, treat all Pending as warnings result.issues.append( @@ -392,11 +394,11 @@ def _check_error_pods(self, namespace: str, result: ValidationResult) -> None: pods = self.commands.kubectl.get_pods(namespace) # Group job-owned pods by their job name - job_pods: dict[str, list[dict[str, str | int]]] = {} - non_job_pods: list[dict[str, str | int]] = [] + job_pods: dict[str, list[PodInfo]] = {} + non_job_pods: list[PodInfo] = [] for pod in pods: - job_owner = str(pod.get("jobOwner", "")) + job_owner = pod.job_owner if job_owner: if job_owner not in job_pods: job_pods[job_owner] = [] @@ -406,8 +408,8 @@ def _check_error_pods(self, namespace: str, result: ValidationResult) -> None: # Check non-job pods for errors (these are always relevant) for pod in non_job_pods: - if pod.get("status") == "Error": - pod_name = str(pod["name"]) + if pod.status == "Error": + pod_name = pod.name result.issues.append( ValidationIssue( severity=ValidationSeverity.ERROR, @@ -431,7 +433,7 @@ def _check_error_pods(self, namespace: str, result: ValidationResult) -> None: # ISO 8601 timestamps sort correctly as strings sorted_pods = sorted( pods_list, - key=lambda p: str(p.get("creationTimestamp", "")), + key=lambda p: p.creation_timestamp, reverse=True, ) @@ -439,12 +441,12 @@ def _check_error_pods(self, namespace: str, result: ValidationResult) -> None: continue most_recent_pod = sorted_pods[0] - pod_status = most_recent_pod.get("status") + pod_status = most_recent_pod.status # Only flag if the most recent pod is in Error state # Completed/Succeeded pods are fine, older failed pods are irrelevant if pod_status == "Error": - pod_name = str(most_recent_pod["name"]) + pod_name = most_recent_pod.name result.issues.append( ValidationIssue( severity=ValidationSeverity.WARNING, diff --git a/src/cli/deployment/shell_commands/kubectl.py b/src/cli/deployment/shell_commands/kubectl.py index 575e3d1..71a8070 100644 --- a/src/cli/deployment/shell_commands/kubectl.py +++ b/src/cli/deployment/shell_commands/kubectl.py @@ -1,16 +1,23 @@ """Kubectl command abstractions. This module provides commands for Kubernetes resource management via kubectl, -organized into logical groups for namespaces, deployments, ReplicaSets, and pods. +delegating to Kr8sController for the actual operations. 
+ +This is a sync wrapper around the async Kr8sController for backward +compatibility with existing code. """ from __future__ import annotations -import json -from datetime import datetime from typing import TYPE_CHECKING -from .types import CommandResult, ReplicaSetInfo +from src.infra.k8s import Kr8sController, run_sync +from src.infra.k8s.controller import ( + CommandResult, + JobInfo, + PodInfo, + ReplicaSetInfo, +) if TYPE_CHECKING: from .runner import CommandRunner @@ -19,6 +26,10 @@ class KubectlCommands: """Kubectl-related shell commands. + This is a sync wrapper around Kr8sController that provides backward + compatibility with existing code. All methods delegate to the async + controller using run_sync(). + Provides operations for: - Cluster context detection - Namespace management @@ -32,58 +43,32 @@ def __init__(self, runner: CommandRunner) -> None: """Initialize kubectl commands. Args: - runner: Command runner for executing shell commands + runner: Command runner (kept for interface compatibility, not used) """ + # Keep runner reference for interface compatibility self._runner = runner + # Delegate to the async controller + self._controller = Kr8sController() # ========================================================================= # Cluster Context # ========================================================================= def is_minikube_context(self) -> bool: - """Check if the current kubectl context is Minikube. - - Returns: - True if current context is minikube, False otherwise - """ - result = self._runner.run( - ["kubectl", "config", "current-context"], - capture_output=True, - ) - if not result.success: - return False - return "minikube" in result.stdout.strip().lower() + """Check if the current kubectl context is Minikube.""" + return run_sync(self._controller.is_minikube_context()) def get_current_context(self) -> str: - """Get the current kubectl context name. - - Returns: - Context name, or "unknown" if detection fails - """ - result = self._runner.run( - ["kubectl", "config", "current-context"], - capture_output=True, - ) - return result.stdout.strip() if result.success else "unknown" + """Get the current kubectl context name.""" + return run_sync(self._controller.get_current_context()) # ========================================================================= # Namespace Management # ========================================================================= def namespace_exists(self, namespace: str) -> bool: - """Check if a namespace exists. - - Args: - namespace: Namespace to check - - Returns: - True if the namespace exists, False otherwise - """ - result = self._runner.run( - ["kubectl", "get", "namespace", namespace], - capture_output=True, - ) - return result.success + """Check if a namespace exists.""" + return run_sync(self._controller.namespace_exists(namespace)) def delete_namespace( self, @@ -92,38 +77,14 @@ def delete_namespace( wait: bool = True, timeout: str = "120s", ) -> CommandResult: - """Delete a Kubernetes namespace and all its resources. - - Warning: This is a destructive operation that deletes all resources - in the namespace. 
- - Args: - namespace: Namespace to delete - wait: Whether to wait for deletion to complete - timeout: Maximum time to wait - - Returns: - CommandResult with deletion status - """ - cmd = ["kubectl", "delete", "namespace", namespace] - if wait: - cmd.append("--wait=true") - cmd.extend(["--timeout", timeout]) - return self._runner.run(cmd) + """Delete a Kubernetes namespace and all its resources.""" + return run_sync( + self._controller.delete_namespace(namespace, wait=wait, timeout=timeout) + ) def delete_pvcs(self, namespace: str) -> CommandResult: - """Delete all PersistentVolumeClaims in a namespace. - - Args: - namespace: Kubernetes namespace - - Returns: - CommandResult with deletion status - """ - return self._runner.run( - ["kubectl", "delete", "pvc", "--all", "-n", namespace], - capture_output=True, - ) + """Delete all PersistentVolumeClaims in a namespace.""" + return run_sync(self._controller.delete_pvcs(namespace)) # ========================================================================= # Resource Deletion @@ -137,138 +98,36 @@ def delete_resources_by_label( *, force: bool = False, ) -> CommandResult: - """Delete Kubernetes resources matching a label selector. - - Args: - resource_types: Comma-separated resource types - (e.g., "all,configmap,secret") - namespace: Kubernetes namespace - label_selector: Label selector - (e.g., "app.kubernetes.io/instance=my-app") - force: Whether to force delete (bypass graceful deletion) - - Returns: - CommandResult with deletion status - """ - cmd = [ - "kubectl", - "delete", - resource_types, - "-n", - namespace, - "-l", - label_selector, - ] - if force: - cmd.extend(["--force", "--grace-period=0"]) - return self._runner.run(cmd) + """Delete Kubernetes resources matching a label selector.""" + return run_sync( + self._controller.delete_resources_by_label( + resource_types, namespace, label_selector, force=force + ) + ) def delete_helm_secrets( self, namespace: str, release_name: str, ) -> CommandResult: - """Delete Helm release metadata secrets. - - This is useful for cleaning up stuck Helm releases that can't - be uninstalled normally. - - Args: - namespace: Kubernetes namespace - release_name: Helm release name - - Returns: - CommandResult with deletion status - """ - return self._runner.run( - [ - "kubectl", - "delete", - "secret", - "-n", - namespace, - "-l", - f"name={release_name},owner=helm", - ] - ) + """Delete Helm release metadata secrets.""" + return run_sync(self._controller.delete_helm_secrets(namespace, release_name)) # ========================================================================= # ReplicaSet Operations # ========================================================================= def get_replicasets(self, namespace: str) -> list[ReplicaSetInfo]: - """Get all ReplicaSets in a namespace. 
- - Args: - namespace: Kubernetes namespace - - Returns: - List of ReplicaSetInfo objects with parsed metadata - """ - result = self._runner.run( - ["kubectl", "get", "replicasets", "-n", namespace, "-o", "json"] - ) - if not result.success or not result.stdout: - return [] - - try: - data = json.loads(result.stdout) - replicasets = [] - - for rs in data.get("items", []): - metadata = rs.get("metadata", {}) - spec = rs.get("spec", {}) - annotations = metadata.get("annotations", {}) - owner_refs = metadata.get("ownerReferences", []) - - # Parse creation timestamp - created_at = None - if creation_ts := metadata.get("creationTimestamp"): - try: - created_at = datetime.fromisoformat( - creation_ts.replace("Z", "+00:00") - ) - except ValueError: - pass - - # Get owner deployment name - owner_deployment = None - if owner_refs: - owner_deployment = owner_refs[0].get("name") - - replicasets.append( - ReplicaSetInfo( - name=metadata.get("name", ""), - replicas=spec.get("replicas", 0), - revision=annotations.get( - "deployment.kubernetes.io/revision", "" - ), - created_at=created_at, - owner_deployment=owner_deployment, - ) - ) - - return replicasets - except json.JSONDecodeError: - return [] + """Get all ReplicaSets in a namespace.""" + return run_sync(self._controller.get_replicasets(namespace)) def delete_replicaset( self, name: str, namespace: str, ) -> CommandResult: - """Delete a specific ReplicaSet. - - Args: - name: ReplicaSet name - namespace: Kubernetes namespace - - Returns: - CommandResult with deletion status - """ - return self._runner.run( - ["kubectl", "delete", "replicaset", name, "-n", namespace] - ) + """Delete a specific ReplicaSet.""" + return run_sync(self._controller.delete_replicaset(name, namespace)) def scale_replicaset( self, @@ -276,55 +135,16 @@ def scale_replicaset( namespace: str, replicas: int, ) -> CommandResult: - """Scale a ReplicaSet to a specific number of replicas. - - Args: - name: ReplicaSet name - namespace: Kubernetes namespace - replicas: Desired number of replicas - - Returns: - CommandResult with scale status - """ - return self._runner.run( - [ - "kubectl", - "scale", - "replicaset", - name, - f"--replicas={replicas}", - "-n", - namespace, - ] - ) + """Scale a ReplicaSet to a specific number of replicas.""" + return run_sync(self._controller.scale_replicaset(name, namespace, replicas)) # ========================================================================= # Deployment Operations # ========================================================================= def get_deployments(self, namespace: str) -> list[str]: - """Get list of deployment names in a namespace. - - Args: - namespace: Kubernetes namespace - - Returns: - List of deployment names - """ - result = self._runner.run( - [ - "kubectl", - "get", - "deployments", - "-n", - namespace, - "-o", - "jsonpath={.items[*].metadata.name}", - ] - ) - if not result.success or not result.stdout: - return [] - return result.stdout.strip().split() + """Get list of deployment names in a namespace.""" + return run_sync(self._controller.get_deployments(namespace)) def rollout_restart( self, @@ -332,35 +152,10 @@ def rollout_restart( namespace: str, name: str | None = None, ) -> CommandResult: - """Trigger a rolling restart of a deployment/daemonset/statefulset. 
- - Args: - resource_type: Resource type ("deployment", "daemonset", "statefulset") - namespace: Kubernetes namespace - name: Specific resource name, or None to restart all of that type - - Returns: - CommandResult with restart status - - Example: - >>> # Restart all deployments - >>> kubectl.rollout_restart("deployment", "production") - >>> # Restart specific deployment - >>> kubectl.rollout_restart("deployment", "production", "api-server") - """ - if name: - cmd = [ - "kubectl", - "rollout", - "restart", - resource_type, - name, - "-n", - namespace, - ] - else: - cmd = ["kubectl", "rollout", "restart", resource_type, "-n", namespace] - return self._runner.run(cmd, capture_output=True) + """Trigger a rolling restart of a deployment/daemonset/statefulset.""" + return run_sync( + self._controller.rollout_restart(resource_type, namespace, name) + ) def rollout_status( self, @@ -370,75 +165,20 @@ def rollout_status( *, timeout: str = "300s", ) -> CommandResult: - """Wait for a rollout to complete. - - Blocks until the rollout finishes (all pods are ready) or times out. - - Args: - resource_type: Resource type ("deployment", "daemonset", "statefulset") - namespace: Kubernetes namespace - name: Specific resource name, or None to wait for all of that type - timeout: Maximum time to wait for rollout to complete - - Returns: - CommandResult with rollout status - - Example: - >>> # Wait for all deployments to be ready - >>> kubectl.rollout_status("deployment", "production") - >>> # Wait for specific deployment - >>> kubectl.rollout_status("deployment", "production", "api-server") - """ - if name: - cmd = [ - "kubectl", - "rollout", - "status", - resource_type, - name, - "-n", - namespace, - f"--timeout={timeout}", - ] - else: - cmd = [ - "kubectl", - "rollout", - "status", - resource_type, - "-n", - namespace, - f"--timeout={timeout}", - ] - return self._runner.run(cmd, capture_output=False) + """Wait for a rollout to complete.""" + return run_sync( + self._controller.rollout_status( + resource_type, namespace, name, timeout=timeout + ) + ) def get_deployment_revision( self, name: str, namespace: str, ) -> str | None: - """Get the current revision number of a deployment. - - Args: - name: Deployment name - namespace: Kubernetes namespace - - Returns: - Revision number as string, or None if not found - """ - result = self._runner.run( - [ - "kubectl", - "get", - "deployment", - name, - "-n", - namespace, - "-o", - "jsonpath={.metadata.annotations.deployment\\.kubernetes\\.io/revision}", - ] - ) - return result.stdout.strip() if result.success and result.stdout else None + """Get the current revision number of a deployment.""" + return run_sync(self._controller.get_deployment_revision(name, namespace)) # ========================================================================= # Pod Operations @@ -452,153 +192,29 @@ def wait_for_pods( condition: str = "ready", timeout: str = "300s", ) -> CommandResult: - """Wait for pods matching a selector to reach a condition. - - Args: - namespace: Kubernetes namespace - label_selector: Label selector for pods - condition: Condition to wait for (e.g., "ready", "delete") - timeout: Maximum time to wait - - Returns: - CommandResult with wait status - - Example: - >>> kubectl.wait_for_pods( - ... "production", - ... "app.kubernetes.io/component=application", - ... timeout="120s", - ... 
) - """ - return self._runner.run( - [ - "kubectl", - "wait", - "--for", - f"condition={condition}", - "pod", - "-l", - label_selector, - "-n", - namespace, - f"--timeout={timeout}", - ], - capture_output=False, + """Wait for pods matching a selector to reach a condition.""" + return run_sync( + self._controller.wait_for_pods( + namespace, label_selector, condition=condition, timeout=timeout + ) ) - def get_pods(self, namespace: str) -> list[dict[str, str | int]]: + def get_pods(self, namespace: str) -> list[PodInfo]: """Get all pods in a namespace with their status. - Args: - namespace: Kubernetes namespace - - Returns: - List of dicts with pod name, status, restarts, creation timestamp, - and job owner (if pod is owned by a Job) + Note: Return type changed from list[dict] to list[PodInfo]. + Access fields as attributes: pod.name, pod.status, etc. """ - result = self._runner.run( - ["kubectl", "get", "pods", "-n", namespace, "-o", "json"], - capture_output=True, - ) - if not result.success or not result.stdout: - return [] - - try: - data = json.loads(result.stdout) - pods = [] - - for pod in data.get("items", []): - metadata = pod.get("metadata", {}) - name = metadata.get("name", "") - creation_timestamp = metadata.get("creationTimestamp", "") - status = pod.get("status", {}) - - # Check if pod is owned by a Job - job_owner = "" - for owner_ref in metadata.get("ownerReferences", []): - if owner_ref.get("kind") == "Job": - job_owner = owner_ref.get("name", "") - break - - # Determine pod status - phase = status.get("phase", "Unknown") - container_statuses = status.get("containerStatuses", []) - - # Check for specific states - pod_status = phase - restarts = 0 - - for cs in container_statuses: - restarts += cs.get("restartCount", 0) - state = cs.get("state", {}) - if "waiting" in state: - reason = state["waiting"].get("reason", "") - if reason: - pod_status = reason # e.g., CrashLoopBackOff - elif "terminated" in state: - reason = state["terminated"].get("reason", "") - if reason == "Error": - pod_status = "Error" - - pods.append( - { - "name": name, - "status": pod_status, - "restarts": restarts, - "creationTimestamp": creation_timestamp, - "jobOwner": job_owner, - } - ) - - return pods - except json.JSONDecodeError: - return [] + return run_sync(self._controller.get_pods(namespace)) # ========================================================================= # Job Operations # ========================================================================= - def get_jobs(self, namespace: str) -> list[dict[str, str]]: + def get_jobs(self, namespace: str) -> list[JobInfo]: """Get all jobs in a namespace with their status. - Args: - namespace: Kubernetes namespace - - Returns: - List of dicts with job name and status (Running/Complete/Failed) + Note: Return type changed from list[dict] to list[JobInfo]. 
+ Access fields as attributes: job.name, job.status """ - result = self._runner.run( - ["kubectl", "get", "jobs", "-n", namespace, "-o", "json"], - capture_output=True, - ) - if not result.success or not result.stdout: - return [] - - try: - data = json.loads(result.stdout) - jobs = [] - - for job in data.get("items", []): - name = job.get("metadata", {}).get("name", "") - status = job.get("status", {}) - - # Determine job status - if status.get("succeeded", 0) > 0: - job_status = "Complete" - elif status.get("failed", 0) > 0: - job_status = "Failed" - elif status.get("active", 0) > 0: - job_status = "Running" - else: - job_status = "Unknown" - - jobs.append( - { - "name": name, - "status": job_status, - } - ) - - return jobs - except json.JSONDecodeError: - return [] + return run_sync(self._controller.get_jobs(namespace)) diff --git a/src/cli/deployment/shell_commands/types.py b/src/cli/deployment/shell_commands/types.py index 03391bb..1f7a5f3 100644 --- a/src/cli/deployment/shell_commands/types.py +++ b/src/cli/deployment/shell_commands/types.py @@ -2,6 +2,9 @@ This module contains all dataclasses and type definitions used across the shell command modules. + +Note: CommandResult and ReplicaSetInfo are re-exported from src.infra.k8s.controller +for backward compatibility. New code should import directly from there. """ from __future__ import annotations @@ -9,22 +12,16 @@ from dataclasses import dataclass from datetime import UTC, datetime +# Re-export Kubernetes types from canonical location +from src.infra.k8s.controller import CommandResult, ReplicaSetInfo -@dataclass -class CommandResult: - """Result of a shell command execution. - - Attributes: - success: Whether the command completed successfully (exit code 0) - stdout: Standard output from the command - stderr: Standard error from the command - returncode: The exit code of the command - """ - - success: bool - stdout: str - stderr: str - returncode: int +__all__ = [ + "CommandResult", + "ReplicaSetInfo", + "HelmRelease", + "GitStatus", + "calculate_replicaset_age_hours", +] @dataclass @@ -44,25 +41,6 @@ class HelmRelease: revision: str -@dataclass -class ReplicaSetInfo: - """Information about a Kubernetes ReplicaSet. - - Attributes: - name: ReplicaSet name - replicas: Desired replica count - revision: Deployment revision annotation - created_at: Creation timestamp - owner_deployment: Name of the owning Deployment (if any) - """ - - name: str - replicas: int - revision: str - created_at: datetime | None - owner_deployment: str | None - - @dataclass class GitStatus: """Git repository status information. 
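Every method of the rewritten `KubectlCommands` above funnels through `run_sync`, and the `types.py` re-exports keep old import paths working while the dataclasses move to `src.infra.k8s.controller`. The `run_sync` helper itself (`src/infra/k8s/utils.py`) is not shown in this diff; a minimal sketch of what it plausibly is, assuming a plain `asyncio.run` bridge, which also explains why `Kr8sController` below refuses to cache its API client:

```python
"""Sketch of src/infra/k8s/utils.py -- assumed, not shown in this diff."""

import asyncio
from collections.abc import Coroutine
from typing import Any, TypeVar

T = TypeVar("T")


def run_sync(coro: Coroutine[Any, Any, T]) -> T:
    """Run an async controller call from synchronous CLI code.

    asyncio.run() creates and closes a fresh event loop on every call,
    which is why Kr8sController cannot reuse a cached kr8s API client.
    """
    return asyncio.run(coro)
```

On the caller side the migration is mechanical: `pod["status"]` becomes `pod.status` and `job["name"]` becomes `job.name`, exactly as the validator hunks earlier in this diff show.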
diff --git a/src/cli/deployment/status_display.py b/src/cli/deployment/status_display.py index aaeb2f9..008de23 100644 --- a/src/cli/deployment/status_display.py +++ b/src/cli/deployment/status_display.py @@ -1,7 +1,6 @@ """Status display utilities for deployment environments.""" import os -import subprocess import requests # type: ignore from dotenv.main import load_dotenv @@ -15,10 +14,15 @@ check_redis_status, check_temporal_status, ) +from src.infra.k8s import Kr8sController, run_sync +from src.infra.k8s.controller import PodInfo, ServiceInfo from .health_checks import HealthChecker from .service_config import get_production_services, is_temporal_enabled +# Module-level controller singleton +_controller = Kr8sController() + class StatusDisplay: """Utility class for displaying deployment status.""" @@ -88,34 +92,27 @@ def show_k8s_status(self, namespace: str = "api-forge-prod") -> None: Args: namespace: Kubernetes namespace to check """ - self.console.print( - Panel.fit( - "[bold magenta]Kubernetes Deployment Status[/bold magenta]", - border_style="magenta", + # Note: Header is printed by the calling command, don't duplicate + + # Get and format pod status + pods = run_sync(_controller.get_pods(namespace)) + self.console.print("\n[bold cyan]Pods:[/bold cyan]") + if pods: + pods_output = self._format_pods_table(pods) + self.console.print(pods_output) + else: + self.console.print(f" [dim]No pods found in namespace {namespace}[/dim]") + + # Get and format service status + services = run_sync(_controller.get_services(namespace)) + self.console.print("\n[bold cyan]Services:[/bold cyan]") + if services: + services_output = self._format_services_table(services) + self.console.print(services_output) + else: + self.console.print( + f" [dim]No services found in namespace {namespace}[/dim]" ) - ) - - # Get pod status - result = subprocess.run( - ["kubectl", "get", "pods", "-n", namespace, "-o", "wide"], - capture_output=True, - text=True, - ) - - if result.returncode == 0: - self.console.print("\n[bold cyan]Pods:[/bold cyan]") - self.console.print(result.stdout) - - # Get service status - result = subprocess.run( - ["kubectl", "get", "svc", "-n", namespace], - capture_output=True, - text=True, - ) - - if result.returncode == 0: - self.console.print("\n[bold cyan]Services:[/bold cyan]") - self.console.print(result.stdout) self._show_k8s_access_instructions(namespace) @@ -265,3 +262,55 @@ def _show_k8s_access_instructions(self, namespace: str) -> None: self.console.print( f" └─ View logs: kubectl logs -n {namespace} -l app.kubernetes.io/name=app -f" ) + + def _format_pods_table(self, pods: list[PodInfo]) -> str: + """Format pods data into a kubectl-like table string. + + Args: + pods: List of PodInfo objects + + Returns: + Formatted table string similar to kubectl get pods -o wide + """ + if not pods: + return "" + + # Header row + header = f"{'NAME':<40} {'READY':<8} {'STATUS':<16} {'RESTARTS':<8} {'AGE':<8} {'IP':<15} {'NODE':<20}" + rows = [header] + + for pod in pods: + # Determine ready status (simplified) + ready = "1/1" if pod.status in ["Running", "Succeeded"] else "0/1" + + # Format age (simplified - just show timestamp for now) + age = pod.creation_timestamp[:10] if pod.creation_timestamp else "" + + row = f"{pod.name:<40} {ready:<8} {pod.status:<16} {pod.restarts:<8} {age:<8} {pod.ip:<15} {pod.node:<20}" + rows.append(row) + + return "\n".join(rows) + + def _format_services_table(self, services: list[ServiceInfo]) -> str: + """Format services data into a kubectl-like table string. 
+ + Args: + services: List of ServiceInfo objects + + Returns: + Formatted table string similar to kubectl get svc + """ + if not services: + return "" + + # Header row + header = f"{'NAME':<30} {'TYPE':<15} {'CLUSTER-IP':<15} {'EXTERNAL-IP':<15} {'PORT(S)':<20}" + rows = [header] + + for svc in services: + external_ip = svc.external_ip if svc.external_ip else "" + + row = f"{svc.name:<30} {svc.type:<15} {svc.cluster_ip:<15} {external_ip:<15} {svc.ports:<20}" + rows.append(row) + + return "\n".join(rows) diff --git a/src/cli/utils.py b/src/cli/utils.py deleted file mode 100644 index b6d0abb..0000000 --- a/src/cli/utils.py +++ /dev/null @@ -1,109 +0,0 @@ -"""Shared utilities for CLI commands.""" - -import subprocess -from pathlib import Path -from typing import Any - -import typer -from rich.console import Console -from rich.panel import Panel - -# Initialize Rich console for colored output -console = Console() - - -def confirm_destructive_action( - action: str, - details: str | None = None, - extra_warning: str | None = None, - force: bool = False, -) -> bool: - """Prompt user to confirm a destructive action. - - Args: - action: Description of the action (e.g., "Stop all services") - details: Additional details about what will be affected - extra_warning: Extra warning message (e.g., for data loss) - force: If True, skip the confirmation prompt - - Returns: - True if the user confirmed, False otherwise - """ - if force: - return True - - # Build warning message - warning_lines = [f"[bold red]âš ī¸ {action}[/bold red]"] - - if details: - warning_lines.append(f"\n{details}") - - if extra_warning: - warning_lines.append(f"\n[yellow]{extra_warning}[/yellow]") - - console.print( - Panel( - "\n".join(warning_lines), - title="Confirmation Required", - border_style="red", - ) - ) - - try: - # Escape brackets with backslash for Rich markup - response = console.input( - "\n[bold]Are you sure you want to proceed?[/bold] \\[y/N]: " - ) - return response.strip().lower() in ("y", "yes") - except (KeyboardInterrupt, EOFError): - console.print("\n[dim]Cancelled.[/dim]") - return False - - -def get_project_root() -> Path: - """Get the project root directory. - - Walks up from the module location to find the project root, - identified by the presence of pyproject.toml. 
- """ - current = Path(__file__).resolve() - - # Walk up the directory tree looking for pyproject.toml - for parent in [current, *current.parents]: - if (parent / "pyproject.toml").exists(): - return parent - - # Fallback to three levels up (src/cli/utils.py -> project root) - return Path(__file__).parent.parent.parent - - -def get_dev_dir() -> Path: - """Get the dev_env directory (infrastructure and Docker files).""" - project_root = get_project_root() - return project_root / "docker" / "dev" - - -def run_command( - command: list[str], - cwd: Path | None = None, - check: bool = True, - capture_output: bool = False, -) -> subprocess.CompletedProcess[Any]: - """Run a shell command with proper error handling.""" - try: - result = subprocess.run( - command, - cwd=cwd or get_project_root(), - check=check, - capture_output=capture_output, - text=True, - ) - return result - except subprocess.CalledProcessError as e: - console.print(f"[red]Command failed: {' '.join(command)}[/red]") - console.print(f"[red]Exit code: {e.returncode}[/red]") - if e.stdout: - console.print(f"[red]stdout: {e.stdout}[/red]") - if e.stderr: - console.print(f"[red]stderr: {e.stderr}[/red]") - raise typer.Exit(1) from e diff --git a/src/infra/k8s/__init__.py b/src/infra/k8s/__init__.py new file mode 100644 index 0000000..25a7c5e --- /dev/null +++ b/src/infra/k8s/__init__.py @@ -0,0 +1,50 @@ +"""Kubernetes infrastructure abstraction layer. + +This module provides a clean abstraction over Kubernetes operations, +supporting multiple backends (kubectl subprocess, kr8s library). + +Example: + from src.infra.k8s import KubernetesController, KubectlController, run_sync + + # Create controller + controller = KubectlController() + + # Use async methods in sync context + exists = run_sync(controller.namespace_exists("my-namespace")) + pods = run_sync(controller.get_pods("my-namespace")) + + # Or use the kr8s-based controller for native async operations + from src.infra.k8s import Kr8sController + + kr8s_controller = Kr8sController() + pods = run_sync(kr8s_controller.get_pods("my-namespace")) +""" + +from .controller import ( + ClusterIssuerStatus, + CommandResult, + JobInfo, + KubernetesController, + PodInfo, + ReplicaSetInfo, + ServiceInfo, +) +from .kr8s_controller import Kr8sController +from .kubectl_controller import KubectlController +from .utils import run_sync + +__all__ = [ + # Controller classes + "KubernetesController", + "KubectlController", + "Kr8sController", + # Data classes + "CommandResult", + "PodInfo", + "ReplicaSetInfo", + "JobInfo", + "ServiceInfo", + "ClusterIssuerStatus", + # Utilities + "run_sync", +] diff --git a/src/infra/k8s/controller.py b/src/infra/k8s/controller.py new file mode 100644 index 0000000..6997eff --- /dev/null +++ b/src/infra/k8s/controller.py @@ -0,0 +1,494 @@ +"""Abstract Kubernetes controller interface. + +Defines the contract for Kubernetes operations that can be implemented +by different backends (kubectl subprocess, kr8s library, etc.). 
+""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path + +# ============================================================================= +# Data Types +# ============================================================================= + + +@dataclass +class CommandResult: + """Result of a command execution.""" + + success: bool + stdout: str = "" + stderr: str = "" + returncode: int = 0 + + +@dataclass +class PodInfo: + """Information about a Kubernetes pod.""" + + name: str + status: str + restarts: int = 0 + creation_timestamp: str = "" + job_owner: str = "" + ip: str = "" + node: str = "" + + +@dataclass +class ReplicaSetInfo: + """Information about a Kubernetes ReplicaSet.""" + + name: str + replicas: int + revision: str = "" + created_at: datetime | None = None + owner_deployment: str | None = None + + +@dataclass +class JobInfo: + """Information about a Kubernetes Job.""" + + name: str + status: str # "Running", "Complete", "Failed", "Unknown" + + +@dataclass +class ServiceInfo: + """Information about a Kubernetes Service.""" + + name: str + type: str + cluster_ip: str + external_ip: str = "" + ports: str = "" + + +@dataclass +class ClusterIssuerStatus: + """Status of a cert-manager ClusterIssuer.""" + + exists: bool + ready: bool + message: str = "" + + +# ============================================================================= +# Abstract Controller +# ============================================================================= + + +class KubernetesController(ABC): + """Abstract base class for Kubernetes operations. + + All methods are async to support both sync (kubectl) and async (kr8s) + implementations. Use `run_sync()` to call from synchronous code. + + Example: + from src.infra.k8s import KubectlController, run_sync + + controller = KubectlController() + pods = run_sync(controller.get_pods("my-namespace")) + """ + + # ========================================================================= + # Cluster Context + # ========================================================================= + + @abstractmethod + async def get_current_context(self) -> str: + """Get the current kubectl context name. + + Returns: + Context name, or "unknown" if detection fails + """ + ... + + @abstractmethod + async def is_minikube_context(self) -> bool: + """Check if the current kubectl context is Minikube. + + Returns: + True if current context is minikube, False otherwise + """ + ... + + # ========================================================================= + # Namespace Operations + # ========================================================================= + + @abstractmethod + async def namespace_exists(self, namespace: str) -> bool: + """Check if a namespace exists. + + Args: + namespace: Namespace to check + + Returns: + True if the namespace exists, False otherwise + """ + ... + + @abstractmethod + async def delete_namespace( + self, + namespace: str, + *, + wait: bool = True, + timeout: str = "120s", + ) -> CommandResult: + """Delete a Kubernetes namespace and all its resources. + + Warning: This is a destructive operation. + + Args: + namespace: Namespace to delete + wait: Whether to wait for deletion to complete + timeout: Maximum time to wait + + Returns: + CommandResult with deletion status + """ + ... + + @abstractmethod + async def delete_pvcs(self, namespace: str) -> CommandResult: + """Delete all PersistentVolumeClaims in a namespace. 
+ + Args: + namespace: Kubernetes namespace + + Returns: + CommandResult with deletion status + """ + ... + + # ========================================================================= + # Resource Operations + # ========================================================================= + + @abstractmethod + async def apply_manifest(self, manifest_path: Path) -> CommandResult: + """Apply a Kubernetes manifest file. + + Args: + manifest_path: Path to the YAML manifest file + + Returns: + CommandResult with apply status + """ + ... + + @abstractmethod + async def delete_resources_by_label( + self, + resource_types: str, + namespace: str, + label_selector: str, + *, + force: bool = False, + ) -> CommandResult: + """Delete Kubernetes resources matching a label selector. + + Args: + resource_types: Comma-separated resource types + (e.g., "all,configmap,secret") + namespace: Kubernetes namespace + label_selector: Label selector + (e.g., "app.kubernetes.io/instance=my-app") + force: Whether to force delete (bypass graceful deletion) + + Returns: + CommandResult with deletion status + """ + ... + + @abstractmethod + async def delete_helm_secrets( + self, + namespace: str, + release_name: str, + ) -> CommandResult: + """Delete Helm release metadata secrets. + + Useful for cleaning up stuck Helm releases. + + Args: + namespace: Kubernetes namespace + release_name: Helm release name + + Returns: + CommandResult with deletion status + """ + ... + + # ========================================================================= + # Deployment Operations + # ========================================================================= + + @abstractmethod + async def get_deployments(self, namespace: str) -> list[str]: + """Get list of deployment names in a namespace. + + Args: + namespace: Kubernetes namespace + + Returns: + List of deployment names + """ + ... + + @abstractmethod + async def rollout_restart( + self, + resource_type: str, + namespace: str, + name: str | None = None, + ) -> CommandResult: + """Trigger a rolling restart of a deployment/daemonset/statefulset. + + Args: + resource_type: Resource type ("deployment", "daemonset", "statefulset") + namespace: Kubernetes namespace + name: Specific resource name, or None to restart all of that type + + Returns: + CommandResult with restart status + """ + ... + + @abstractmethod + async def rollout_status( + self, + resource_type: str, + namespace: str, + name: str | None = None, + *, + timeout: str = "300s", + ) -> CommandResult: + """Wait for a rollout to complete. + + Blocks until the rollout finishes (all pods are ready) or times out. + + Args: + resource_type: Resource type ("deployment", "daemonset", "statefulset") + namespace: Kubernetes namespace + name: Specific resource name, or None to wait for all of that type + timeout: Maximum time to wait for rollout to complete + + Returns: + CommandResult with rollout status + """ + ... + + @abstractmethod + async def get_deployment_revision( + self, + name: str, + namespace: str, + ) -> str | None: + """Get the current revision number of a deployment. + + Args: + name: Deployment name + namespace: Kubernetes namespace + + Returns: + Revision number as string, or None if not found + """ + ... + + # ========================================================================= + # ReplicaSet Operations + # ========================================================================= + + @abstractmethod + async def get_replicasets(self, namespace: str) -> list[ReplicaSetInfo]: + """Get all ReplicaSets in a namespace. 
+ + Args: + namespace: Kubernetes namespace + + Returns: + List of ReplicaSetInfo objects with parsed metadata + """ + ... + + @abstractmethod + async def delete_replicaset( + self, + name: str, + namespace: str, + ) -> CommandResult: + """Delete a specific ReplicaSet. + + Args: + name: ReplicaSet name + namespace: Kubernetes namespace + + Returns: + CommandResult with deletion status + """ + ... + + @abstractmethod + async def scale_replicaset( + self, + name: str, + namespace: str, + replicas: int, + ) -> CommandResult: + """Scale a ReplicaSet to a specific number of replicas. + + Args: + name: ReplicaSet name + namespace: Kubernetes namespace + replicas: Desired number of replicas + + Returns: + CommandResult with scale status + """ + ... + + # ========================================================================= + # Pod Operations + # ========================================================================= + + @abstractmethod + async def get_pods(self, namespace: str) -> list[PodInfo]: + """Get all pods in a namespace with their status. + + Args: + namespace: Kubernetes namespace + + Returns: + List of PodInfo objects with pod details + """ + ... + + @abstractmethod + async def wait_for_pods( + self, + namespace: str, + label_selector: str, + *, + condition: str = "ready", + timeout: str = "300s", + ) -> CommandResult: + """Wait for pods matching a selector to reach a condition. + + Args: + namespace: Kubernetes namespace + label_selector: Label selector for pods + condition: Condition to wait for (e.g., "ready", "delete") + timeout: Maximum time to wait + + Returns: + CommandResult with wait status + """ + ... + + @abstractmethod + async def get_pod_logs( + self, + namespace: str, + pod: str | None = None, + *, + container: str | None = None, + label_selector: str | None = None, + follow: bool = False, + tail: int = 100, + previous: bool = False, + ) -> CommandResult: + """Get logs from Kubernetes pods. + + Args: + namespace: Kubernetes namespace + pod: Specific pod name, or None to use label_selector + container: Container name (if pod has multiple containers) + label_selector: Label selector for pods (if pod is None) + follow: Whether to follow log output + tail: Number of lines to show from the end + previous: Show logs from previous container instance + + Returns: + CommandResult with logs in stdout + """ + ... + + # ========================================================================= + # Job Operations + # ========================================================================= + + @abstractmethod + async def get_jobs(self, namespace: str) -> list[JobInfo]: + """Get all jobs in a namespace with their status. + + Args: + namespace: Kubernetes namespace + + Returns: + List of JobInfo objects + """ + ... + + # ========================================================================= + # Service Operations + # ========================================================================= + + @abstractmethod + async def get_services(self, namespace: str) -> list[ServiceInfo]: + """Get all services in a namespace. + + Args: + namespace: Kubernetes namespace + + Returns: + List of ServiceInfo objects + """ + ... + + # ========================================================================= + # Cert-Manager Operations + # ========================================================================= + + @abstractmethod + async def check_cert_manager_installed(self) -> bool: + """Check if cert-manager is installed in the cluster. 
+ + Returns: + True if cert-manager pods are running, False otherwise + """ + ... + + @abstractmethod + async def get_cluster_issuer_status( + self, + issuer_name: str, + ) -> ClusterIssuerStatus: + """Get the status of a cert-manager ClusterIssuer. + + Args: + issuer_name: Name of the ClusterIssuer + + Returns: + ClusterIssuerStatus with exists, ready, and message + """ + ... + + @abstractmethod + async def get_cluster_issuer_yaml(self, issuer_name: str) -> str | None: + """Get the YAML representation of a ClusterIssuer. + + Args: + issuer_name: Name of the ClusterIssuer + + Returns: + YAML string, or None if not found + """ + ... diff --git a/src/infra/k8s/kr8s_controller.py b/src/infra/k8s/kr8s_controller.py new file mode 100644 index 0000000..5c0ff37 --- /dev/null +++ b/src/infra/k8s/kr8s_controller.py @@ -0,0 +1,764 @@ +"""Kr8s-based implementation of KubernetesController. + +Uses the kr8s library for native async Kubernetes operations. +""" + +from __future__ import annotations + +import asyncio +from datetime import datetime +from pathlib import Path +from typing import Any + +import kr8s +from kr8s.asyncio.objects import ( + Deployment, + Job, + Namespace, + PersistentVolumeClaim, + Pod, + ReplicaSet, + Secret, + Service, +) + +from .controller import ( + ClusterIssuerStatus, + CommandResult, + JobInfo, + KubernetesController, + PodInfo, + ReplicaSetInfo, + ServiceInfo, +) + + +class Kr8sController(KubernetesController): + """Kubernetes controller using kr8s library. + + All methods are natively async, leveraging kr8s's async API. + + Note: The kr8s API client is NOT cached because it's tied to the event loop + that was running when created. When using run_sync() which calls asyncio.run(), + each call creates a new event loop, making the cached API unusable. + """ + + def __init__(self) -> None: + """Initialize the kr8s controller.""" + # Note: We don't cache the API because kr8s clients are tied to + # the event loop they were created in. Since run_sync() uses + # asyncio.run() which creates/closes event loops, we need a fresh + # API client each time. + pass + + async def _get_api(self) -> Any: # Returns kr8s._api.Api + """Get or create the kr8s API client. + + Creates a new API client each call because kr8s clients are bound + to the event loop they were created in. 
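+
+        Example (illustrative; mirrors how the methods below use it):
+            api = await self._get_api()
+            names = [d.name async for d in Deployment.list(namespace="default", api=api)]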
+ """ + return await kr8s.asyncio.api() + + # ========================================================================= + # Cluster Context + # ========================================================================= + + async def get_current_context(self) -> str: + """Get the current kubectl context name.""" + try: + api = await self._get_api() + # Access context via auth object + return api.auth.active_context or "unknown" + except Exception: + return "unknown" + + async def is_minikube_context(self) -> bool: + """Check if the current kubectl context is Minikube.""" + context = await self.get_current_context() + return "minikube" in context.lower() + + # ========================================================================= + # Namespace Operations + # ========================================================================= + + async def namespace_exists(self, namespace: str) -> bool: + """Check if a namespace exists.""" + try: + api = await self._get_api() + ns = await Namespace.get(namespace, api=api) + return ns is not None + except kr8s.NotFoundError: + return False + except Exception: + return False + + async def delete_namespace( + self, + namespace: str, + *, + wait: bool = True, + timeout: str = "120s", + ) -> CommandResult: + """Delete a Kubernetes namespace and all its resources.""" + try: + api = await self._get_api() + ns = await Namespace.get(namespace, api=api) + await ns.delete() + + if wait: + # Parse timeout + timeout_seconds = self._parse_timeout(timeout) + try: + await asyncio.wait_for( + self._wait_for_namespace_deletion(namespace), + timeout=timeout_seconds, + ) + except TimeoutError: + return CommandResult( + success=False, + stderr=f"Timeout waiting for namespace {namespace} deletion", + returncode=1, + ) + + return CommandResult( + success=True, stdout=f'namespace "{namespace}" deleted' + ) + except kr8s.NotFoundError: + return CommandResult( + success=False, + stderr=f'namespace "{namespace}" not found', + returncode=1, + ) + except Exception as e: + return CommandResult(success=False, stderr=str(e), returncode=1) + + async def _wait_for_namespace_deletion(self, namespace: str) -> None: + """Wait until a namespace no longer exists.""" + while await self.namespace_exists(namespace): + await asyncio.sleep(1) + + async def delete_pvcs(self, namespace: str) -> CommandResult: + """Delete all PersistentVolumeClaims in a namespace.""" + try: + api = await self._get_api() + deleted = [] + async for pvc in PersistentVolumeClaim.list(namespace=namespace, api=api): + await pvc.delete() + deleted.append(pvc.name) + return CommandResult( + success=True, + stdout=f"Deleted PVCs: {', '.join(deleted)}" + if deleted + else "No PVCs found", + ) + except Exception as e: + return CommandResult(success=False, stderr=str(e), returncode=1) + + # ========================================================================= + # Resource Operations + # ========================================================================= + + async def apply_manifest(self, manifest_path: Path) -> CommandResult: + """Apply a Kubernetes manifest file. + + Note: kr8s doesn't have a direct 'apply' equivalent, so we use + kubectl subprocess for this operation. 
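+
+        Roughly equivalent to:
+            kubectl apply -f <manifest_path>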
+ """ + import subprocess + + def _run() -> CommandResult: + result = subprocess.run( + ["kubectl", "apply", "-f", str(manifest_path)], + capture_output=True, + text=True, + ) + return CommandResult( + success=result.returncode == 0, + stdout=result.stdout or "", + stderr=result.stderr or "", + returncode=result.returncode, + ) + + return await asyncio.to_thread(_run) + + async def delete_resources_by_label( + self, + resource_types: str, + namespace: str, + label_selector: str, + *, + force: bool = False, + ) -> CommandResult: + """Delete Kubernetes resources matching a label selector. + + Note: Uses kubectl for complex multi-resource deletion. + """ + import subprocess + + cmd = [ + "kubectl", + "delete", + resource_types, + "-n", + namespace, + "-l", + label_selector, + ] + if force: + cmd.extend(["--force", "--grace-period=0"]) + + def _run() -> CommandResult: + result = subprocess.run(cmd, capture_output=True, text=True) + return CommandResult( + success=result.returncode == 0, + stdout=result.stdout or "", + stderr=result.stderr or "", + returncode=result.returncode, + ) + + return await asyncio.to_thread(_run) + + async def delete_helm_secrets( + self, + namespace: str, + release_name: str, + ) -> CommandResult: + """Delete Helm release metadata secrets.""" + try: + api = await self._get_api() + deleted = [] + async for secret in Secret.list( + namespace=namespace, + label_selector=f"name={release_name},owner=helm", + api=api, + ): + await secret.delete() + deleted.append(secret.name) + return CommandResult( + success=True, + stdout=f"Deleted secrets: {', '.join(deleted)}" + if deleted + else "No secrets found", + ) + except Exception as e: + return CommandResult(success=False, stderr=str(e), returncode=1) + + # ========================================================================= + # Deployment Operations + # ========================================================================= + + async def get_deployments(self, namespace: str) -> list[str]: + """Get list of deployment names in a namespace.""" + try: + api = await self._get_api() + return [d.name async for d in Deployment.list(namespace=namespace, api=api)] + except Exception: + return [] + + async def rollout_restart( + self, + resource_type: str, + namespace: str, + name: str | None = None, + ) -> CommandResult: + """Trigger a rolling restart of a deployment/daemonset/statefulset. + + Note: kr8s doesn't have a direct rollout restart, using kubectl. + """ + import subprocess + + if name: + cmd = [ + "kubectl", + "rollout", + "restart", + resource_type, + name, + "-n", + namespace, + ] + else: + cmd = ["kubectl", "rollout", "restart", resource_type, "-n", namespace] + + def _run() -> CommandResult: + result = subprocess.run(cmd, capture_output=True, text=True) + return CommandResult( + success=result.returncode == 0, + stdout=result.stdout or "", + stderr=result.stderr or "", + returncode=result.returncode, + ) + + return await asyncio.to_thread(_run) + + async def rollout_status( + self, + resource_type: str, + namespace: str, + name: str | None = None, + *, + timeout: str = "300s", + ) -> CommandResult: + """Wait for a rollout to complete. + + Note: Uses kubectl for streaming status output. 
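+
+        Roughly equivalent to:
+            kubectl rollout status <resource_type> [<name>] -n <namespace> --timeout=<timeout>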
+ """ + import subprocess + + if name: + cmd = [ + "kubectl", + "rollout", + "status", + resource_type, + name, + "-n", + namespace, + f"--timeout={timeout}", + ] + else: + cmd = [ + "kubectl", + "rollout", + "status", + resource_type, + "-n", + namespace, + f"--timeout={timeout}", + ] + + def _run() -> CommandResult: + result = subprocess.run(cmd, capture_output=False, text=True) + return CommandResult( + success=result.returncode == 0, + returncode=result.returncode, + ) + + return await asyncio.to_thread(_run) + + async def get_deployment_revision( + self, + name: str, + namespace: str, + ) -> str | None: + """Get the current revision number of a deployment.""" + try: + api = await self._get_api() + deployment = await Deployment.get(name, namespace=namespace, api=api) + annotations = deployment.metadata.get("annotations", {}) + revision: str | None = annotations.get("deployment.kubernetes.io/revision") + return revision + except Exception: + return None + + # ========================================================================= + # ReplicaSet Operations + # ========================================================================= + + async def get_replicasets(self, namespace: str) -> list[ReplicaSetInfo]: + """Get all ReplicaSets in a namespace.""" + try: + api = await self._get_api() + result = [] + + async for rs in ReplicaSet.list(namespace=namespace, api=api): + metadata = rs.metadata + spec = rs.spec + annotations = metadata.get("annotations", {}) + owner_refs = metadata.get("ownerReferences", []) + + # Parse creation timestamp + created_at = None + if creation_ts := metadata.get("creationTimestamp"): + try: + created_at = datetime.fromisoformat( + creation_ts.replace("Z", "+00:00") + ) + except ValueError: + pass + + # Get owner deployment name + owner_deployment = None + if owner_refs: + owner_deployment = owner_refs[0].get("name") + + result.append( + ReplicaSetInfo( + name=metadata.get("name", ""), + replicas=spec.get("replicas", 0), + revision=annotations.get( + "deployment.kubernetes.io/revision", "" + ), + created_at=created_at, + owner_deployment=owner_deployment, + ) + ) + + return result + except Exception: + return [] + + async def delete_replicaset( + self, + name: str, + namespace: str, + ) -> CommandResult: + """Delete a specific ReplicaSet.""" + try: + api = await self._get_api() + rs = await ReplicaSet.get(name, namespace=namespace, api=api) + await rs.delete() + return CommandResult(success=True, stdout=f'replicaset "{name}" deleted') + except kr8s.NotFoundError: + return CommandResult( + success=False, + stderr=f'replicaset "{name}" not found', + returncode=1, + ) + except Exception as e: + return CommandResult(success=False, stderr=str(e), returncode=1) + + async def scale_replicaset( + self, + name: str, + namespace: str, + replicas: int, + ) -> CommandResult: + """Scale a ReplicaSet to a specific number of replicas.""" + try: + api = await self._get_api() + rs = await ReplicaSet.get(name, namespace=namespace, api=api) + await rs.scale(replicas) + return CommandResult( + success=True, + stdout=f"replicaset/{name} scaled to {replicas}", + ) + except Exception as e: + return CommandResult(success=False, stderr=str(e), returncode=1) + + # ========================================================================= + # Pod Operations + # ========================================================================= + + async def get_pods(self, namespace: str) -> list[PodInfo]: + """Get all pods in a namespace with their status.""" + try: + api = await self._get_api() + 
result = [] + + async for pod in Pod.list(namespace=namespace, api=api): + metadata = pod.metadata + spec = pod.spec + status = pod.status + + name = metadata.get("name", "") + creation_timestamp = metadata.get("creationTimestamp", "") + + # Check if pod is owned by a Job + job_owner = "" + for owner_ref in metadata.get("ownerReferences", []): + if owner_ref.get("kind") == "Job": + job_owner = owner_ref.get("name", "") + break + + # Determine pod status + phase = status.get("phase", "Unknown") + container_statuses = status.get("containerStatuses", []) + + pod_status = phase + restarts = 0 + + for cs in container_statuses: + restarts += cs.get("restartCount", 0) + state = cs.get("state", {}) + if "waiting" in state: + reason = state["waiting"].get("reason", "") + if reason: + pod_status = reason + elif "terminated" in state: + reason = state["terminated"].get("reason", "") + if reason == "Error": + pod_status = "Error" + + result.append( + PodInfo( + name=name, + status=pod_status, + restarts=restarts, + creation_timestamp=creation_timestamp, + job_owner=job_owner, + ip=status.get("podIP", ""), + node=spec.get("nodeName", ""), + ) + ) + + return result + except Exception: + return [] + + async def wait_for_pods( + self, + namespace: str, + label_selector: str, + *, + condition: str = "ready", + timeout: str = "300s", + ) -> CommandResult: + """Wait for pods matching a selector to reach a condition. + + Note: Uses kubectl for the wait operation. + """ + import subprocess + + cmd = [ + "kubectl", + "wait", + "--for", + f"condition={condition}", + "pod", + "-l", + label_selector, + "-n", + namespace, + f"--timeout={timeout}", + ] + + def _run() -> CommandResult: + result = subprocess.run(cmd, capture_output=False, text=True) + return CommandResult( + success=result.returncode == 0, + returncode=result.returncode, + ) + + return await asyncio.to_thread(_run) + + async def get_pod_logs( + self, + namespace: str, + pod: str | None = None, + *, + container: str | None = None, + label_selector: str | None = None, + follow: bool = False, + tail: int = 100, + previous: bool = False, + ) -> CommandResult: + """Get logs from Kubernetes pods. + + Note: Uses kubectl for log streaming support. 
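+
+        Roughly equivalent to one of:
+            kubectl logs -n <namespace> <pod> --tail=<tail>
+            kubectl logs -n <namespace> -l <label_selector> --all-containers=true --tail=<tail>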
+ """ + import subprocess + + cmd = ["kubectl", "logs", "-n", namespace] + + if pod: + cmd.append(pod) + elif label_selector: + cmd.extend(["-l", label_selector, "--all-containers=true"]) + + if container: + cmd.extend(["-c", container]) + + if follow: + cmd.append("-f") + + cmd.append(f"--tail={tail}") + + if previous: + cmd.append("--previous") + + def _run() -> CommandResult: + result = subprocess.run( + cmd, + capture_output=not follow, + text=True, + ) + return CommandResult( + success=result.returncode == 0, + stdout=result.stdout or "" if not follow else "", + stderr=result.stderr or "" if not follow else "", + returncode=result.returncode, + ) + + return await asyncio.to_thread(_run) + + # ========================================================================= + # Job Operations + # ========================================================================= + + async def get_jobs(self, namespace: str) -> list[JobInfo]: + """Get all jobs in a namespace with their status.""" + try: + api = await self._get_api() + result = [] + + async for job in Job.list(namespace=namespace, api=api): + name = job.metadata.get("name", "") + status = job.status + + if status.get("succeeded", 0) > 0: + job_status = "Complete" + elif status.get("failed", 0) > 0: + job_status = "Failed" + elif status.get("active", 0) > 0: + job_status = "Running" + else: + job_status = "Unknown" + + result.append(JobInfo(name=name, status=job_status)) + + return result + except Exception: + return [] + + # ========================================================================= + # Service Operations + # ========================================================================= + + async def get_services(self, namespace: str) -> list[ServiceInfo]: + """Get all services in a namespace.""" + try: + api = await self._get_api() + result = [] + + async for svc in Service.list(namespace=namespace, api=api): + metadata = svc.metadata + spec = svc.spec + status = svc.status + + # Get external IP from LoadBalancer status + external_ip = "" + lb_ingress = status.get("loadBalancer", {}).get("ingress", []) + if lb_ingress: + external_ip = lb_ingress[0].get( + "ip", lb_ingress[0].get("hostname", "") + ) + + # Format ports + ports = [] + for port in spec.get("ports", []): + port_str = f"{port.get('port')}" + if target := port.get("targetPort"): + port_str += f":{target}" + if proto := port.get("protocol"): + port_str += f"/{proto}" + ports.append(port_str) + + result.append( + ServiceInfo( + name=metadata.get("name", ""), + type=spec.get("type", ""), + cluster_ip=spec.get("clusterIP", ""), + external_ip=external_ip, + ports=",".join(ports), + ) + ) + + return result + except Exception: + return [] + + # ========================================================================= + # Cert-Manager Operations + # ========================================================================= + + async def check_cert_manager_installed(self) -> bool: + """Check if cert-manager is installed in the cluster.""" + try: + api = await self._get_api() + pods = [pod async for pod in Pod.list(namespace="cert-manager", api=api)] + return len(pods) > 0 + except Exception: + return False + + async def get_cluster_issuer_status( + self, + issuer_name: str, + ) -> ClusterIssuerStatus: + """Get the status of a cert-manager ClusterIssuer. + + Note: Uses kubectl as ClusterIssuer is a CRD. 
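+
+        Roughly equivalent to:
+            kubectl get clusterissuer <issuer_name> -o json
+        with the Ready condition read from status.conditions.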
+ """ + import subprocess + + def _run() -> ClusterIssuerStatus: + result = subprocess.run( + [ + "kubectl", + "get", + "clusterissuer", + issuer_name, + "-o", + "json", + ], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + return ClusterIssuerStatus( + exists=False, + ready=False, + message="ClusterIssuer not found", + ) + + try: + import json + + data = json.loads(result.stdout) + conditions = data.get("status", {}).get("conditions", []) + + ready = False + message = "" + + for condition in conditions: + if condition.get("type") == "Ready": + ready = condition.get("status") == "True" + message = condition.get("message", "") + break + + return ClusterIssuerStatus( + exists=True, + ready=ready, + message=message, + ) + except Exception: + return ClusterIssuerStatus( + exists=True, + ready=False, + message="Failed to parse ClusterIssuer status", + ) + + return await asyncio.to_thread(_run) + + async def get_cluster_issuer_yaml(self, issuer_name: str) -> str | None: + """Get the YAML representation of a ClusterIssuer. + + Note: Uses kubectl as ClusterIssuer is a CRD. + """ + import subprocess + + def _run() -> str | None: + result = subprocess.run( + ["kubectl", "get", "clusterissuer", issuer_name, "-o", "yaml"], + capture_output=True, + text=True, + ) + return result.stdout if result.returncode == 0 else None + + return await asyncio.to_thread(_run) + + # ========================================================================= + # Helpers + # ========================================================================= + + def _parse_timeout(self, timeout: str) -> float: + """Parse a timeout string like '120s' or '5m' to seconds.""" + if timeout.endswith("s"): + return float(timeout[:-1]) + elif timeout.endswith("m"): + return float(timeout[:-1]) * 60 + elif timeout.endswith("h"): + return float(timeout[:-1]) * 3600 + else: + return float(timeout) diff --git a/src/infra/k8s/kubectl_controller.py b/src/infra/k8s/kubectl_controller.py new file mode 100644 index 0000000..376e366 --- /dev/null +++ b/src/infra/k8s/kubectl_controller.py @@ -0,0 +1,594 @@ +"""Kubectl-based implementation of KubernetesController. + +Uses subprocess calls to kubectl for all operations. +""" + +from __future__ import annotations + +import asyncio +import json +from datetime import datetime +from pathlib import Path + +from .controller import ( + ClusterIssuerStatus, + CommandResult, + JobInfo, + KubernetesController, + PodInfo, + ReplicaSetInfo, + ServiceInfo, +) + + +class KubectlController(KubernetesController): + """Kubernetes controller using kubectl subprocess calls. + + All methods are async but internally use asyncio.to_thread() + to run blocking subprocess calls without blocking the event loop. + """ + + async def _run_kubectl( + self, + args: list[str], + *, + capture_output: bool = True, + input_data: str | None = None, + ) -> CommandResult: + """Run a kubectl command asynchronously. 
+ + Args: + args: Command arguments (without 'kubectl' prefix) + capture_output: Whether to capture stdout/stderr + input_data: Optional input to send to stdin + + Returns: + CommandResult with execution results + """ + import subprocess + + cmd = ["kubectl", *args] + + def _run() -> CommandResult: + result = subprocess.run( + cmd, + capture_output=capture_output, + text=True, + input=input_data, + ) + return CommandResult( + success=result.returncode == 0, + stdout=result.stdout or "", + stderr=result.stderr or "", + returncode=result.returncode, + ) + + return await asyncio.to_thread(_run) + + # ========================================================================= + # Cluster Context + # ========================================================================= + + async def get_current_context(self) -> str: + """Get the current kubectl context name.""" + result = await self._run_kubectl(["config", "current-context"]) + return result.stdout.strip() if result.success else "unknown" + + async def is_minikube_context(self) -> bool: + """Check if the current kubectl context is Minikube.""" + result = await self._run_kubectl(["config", "current-context"]) + if not result.success: + return False + return "minikube" in result.stdout.strip().lower() + + # ========================================================================= + # Namespace Operations + # ========================================================================= + + async def namespace_exists(self, namespace: str) -> bool: + """Check if a namespace exists.""" + result = await self._run_kubectl(["get", "namespace", namespace]) + return result.success + + async def delete_namespace( + self, + namespace: str, + *, + wait: bool = True, + timeout: str = "120s", + ) -> CommandResult: + """Delete a Kubernetes namespace and all its resources.""" + args = ["delete", "namespace", namespace] + if wait: + args.append("--wait=true") + args.extend(["--timeout", timeout]) + return await self._run_kubectl(args) + + async def delete_pvcs(self, namespace: str) -> CommandResult: + """Delete all PersistentVolumeClaims in a namespace.""" + return await self._run_kubectl(["delete", "pvc", "--all", "-n", namespace]) + + # ========================================================================= + # Resource Operations + # ========================================================================= + + async def apply_manifest(self, manifest_path: Path) -> CommandResult: + """Apply a Kubernetes manifest file.""" + return await self._run_kubectl(["apply", "-f", str(manifest_path)]) + + async def delete_resources_by_label( + self, + resource_types: str, + namespace: str, + label_selector: str, + *, + force: bool = False, + ) -> CommandResult: + """Delete Kubernetes resources matching a label selector.""" + args = [ + "delete", + resource_types, + "-n", + namespace, + "-l", + label_selector, + ] + if force: + args.extend(["--force", "--grace-period=0"]) + return await self._run_kubectl(args) + + async def delete_helm_secrets( + self, + namespace: str, + release_name: str, + ) -> CommandResult: + """Delete Helm release metadata secrets.""" + return await self._run_kubectl( + [ + "delete", + "secret", + "-n", + namespace, + "-l", + f"name={release_name},owner=helm", + ] + ) + + # ========================================================================= + # Deployment Operations + # ========================================================================= + + async def get_deployments(self, namespace: str) -> list[str]: + """Get list of deployment names in a 
namespace.""" + result = await self._run_kubectl( + [ + "get", + "deployments", + "-n", + namespace, + "-o", + "jsonpath={.items[*].metadata.name}", + ] + ) + if not result.success or not result.stdout: + return [] + return result.stdout.strip().split() + + async def rollout_restart( + self, + resource_type: str, + namespace: str, + name: str | None = None, + ) -> CommandResult: + """Trigger a rolling restart of a deployment/daemonset/statefulset.""" + if name: + args = [ + "rollout", + "restart", + resource_type, + name, + "-n", + namespace, + ] + else: + args = ["rollout", "restart", resource_type, "-n", namespace] + return await self._run_kubectl(args) + + async def rollout_status( + self, + resource_type: str, + namespace: str, + name: str | None = None, + *, + timeout: str = "300s", + ) -> CommandResult: + """Wait for a rollout to complete.""" + if name: + args = [ + "rollout", + "status", + resource_type, + name, + "-n", + namespace, + f"--timeout={timeout}", + ] + else: + args = [ + "rollout", + "status", + resource_type, + "-n", + namespace, + f"--timeout={timeout}", + ] + return await self._run_kubectl(args, capture_output=False) + + async def get_deployment_revision( + self, + name: str, + namespace: str, + ) -> str | None: + """Get the current revision number of a deployment.""" + result = await self._run_kubectl( + [ + "get", + "deployment", + name, + "-n", + namespace, + "-o", + "jsonpath={.metadata.annotations.deployment\\.kubernetes\\.io/revision}", + ] + ) + return result.stdout.strip() if result.success and result.stdout else None + + # ========================================================================= + # ReplicaSet Operations + # ========================================================================= + + async def get_replicasets(self, namespace: str) -> list[ReplicaSetInfo]: + """Get all ReplicaSets in a namespace.""" + result = await self._run_kubectl( + ["get", "replicasets", "-n", namespace, "-o", "json"] + ) + if not result.success or not result.stdout: + return [] + + try: + data = json.loads(result.stdout) + replicasets = [] + + for rs in data.get("items", []): + metadata = rs.get("metadata", {}) + spec = rs.get("spec", {}) + annotations = metadata.get("annotations", {}) + owner_refs = metadata.get("ownerReferences", []) + + # Parse creation timestamp + created_at = None + if creation_ts := metadata.get("creationTimestamp"): + try: + created_at = datetime.fromisoformat( + creation_ts.replace("Z", "+00:00") + ) + except ValueError: + pass + + # Get owner deployment name + owner_deployment = None + if owner_refs: + owner_deployment = owner_refs[0].get("name") + + replicasets.append( + ReplicaSetInfo( + name=metadata.get("name", ""), + replicas=spec.get("replicas", 0), + revision=annotations.get( + "deployment.kubernetes.io/revision", "" + ), + created_at=created_at, + owner_deployment=owner_deployment, + ) + ) + + return replicasets + except json.JSONDecodeError: + return [] + + async def delete_replicaset( + self, + name: str, + namespace: str, + ) -> CommandResult: + """Delete a specific ReplicaSet.""" + return await self._run_kubectl(["delete", "replicaset", name, "-n", namespace]) + + async def scale_replicaset( + self, + name: str, + namespace: str, + replicas: int, + ) -> CommandResult: + """Scale a ReplicaSet to a specific number of replicas.""" + return await self._run_kubectl( + [ + "scale", + "replicaset", + name, + f"--replicas={replicas}", + "-n", + namespace, + ] + ) + + # 
========================================================================= + # Pod Operations + # ========================================================================= + + async def get_pods(self, namespace: str) -> list[PodInfo]: + """Get all pods in a namespace with their status.""" + result = await self._run_kubectl(["get", "pods", "-n", namespace, "-o", "json"]) + if not result.success or not result.stdout: + return [] + + try: + data = json.loads(result.stdout) + pods = [] + + for pod in data.get("items", []): + metadata = pod.get("metadata", {}) + name = metadata.get("name", "") + creation_timestamp = metadata.get("creationTimestamp", "") + status = pod.get("status", {}) + + # Check if pod is owned by a Job + job_owner = "" + for owner_ref in metadata.get("ownerReferences", []): + if owner_ref.get("kind") == "Job": + job_owner = owner_ref.get("name", "") + break + + # Determine pod status + phase = status.get("phase", "Unknown") + container_statuses = status.get("containerStatuses", []) + + pod_status = phase + restarts = 0 + + for cs in container_statuses: + restarts += cs.get("restartCount", 0) + state = cs.get("state", {}) + if "waiting" in state: + reason = state["waiting"].get("reason", "") + if reason: + pod_status = reason + elif "terminated" in state: + reason = state["terminated"].get("reason", "") + if reason == "Error": + pod_status = "Error" + + pods.append( + PodInfo( + name=name, + status=pod_status, + restarts=restarts, + creation_timestamp=creation_timestamp, + job_owner=job_owner, + ip=status.get("podIP", ""), + node=spec.get("nodeName", "") + if (spec := pod.get("spec")) + else "", + ) + ) + + return pods + except json.JSONDecodeError: + return [] + + async def wait_for_pods( + self, + namespace: str, + label_selector: str, + *, + condition: str = "ready", + timeout: str = "300s", + ) -> CommandResult: + """Wait for pods matching a selector to reach a condition.""" + return await self._run_kubectl( + [ + "wait", + "--for", + f"condition={condition}", + "pod", + "-l", + label_selector, + "-n", + namespace, + f"--timeout={timeout}", + ], + capture_output=False, + ) + + async def get_pod_logs( + self, + namespace: str, + pod: str | None = None, + *, + container: str | None = None, + label_selector: str | None = None, + follow: bool = False, + tail: int = 100, + previous: bool = False, + ) -> CommandResult: + """Get logs from Kubernetes pods.""" + args = ["logs", "-n", namespace] + + if pod: + args.append(pod) + elif label_selector: + args.extend(["-l", label_selector, "--all-containers=true"]) + + if container: + args.extend(["-c", container]) + + if follow: + args.append("-f") + + args.append(f"--tail={tail}") + + if previous: + args.append("--previous") + + return await self._run_kubectl(args, capture_output=not follow) + + # ========================================================================= + # Job Operations + # ========================================================================= + + async def get_jobs(self, namespace: str) -> list[JobInfo]: + """Get all jobs in a namespace with their status.""" + result = await self._run_kubectl(["get", "jobs", "-n", namespace, "-o", "json"]) + if not result.success or not result.stdout: + return [] + + try: + data = json.loads(result.stdout) + jobs = [] + + for job in data.get("items", []): + name = job.get("metadata", {}).get("name", "") + status = job.get("status", {}) + + if status.get("succeeded", 0) > 0: + job_status = "Complete" + elif status.get("failed", 0) > 0: + job_status = "Failed" + elif 
status.get("active", 0) > 0: + job_status = "Running" + else: + job_status = "Unknown" + + jobs.append(JobInfo(name=name, status=job_status)) + + return jobs + except json.JSONDecodeError: + return [] + + # ========================================================================= + # Service Operations + # ========================================================================= + + async def get_services(self, namespace: str) -> list[ServiceInfo]: + """Get all services in a namespace.""" + result = await self._run_kubectl( + ["get", "services", "-n", namespace, "-o", "json"] + ) + if not result.success or not result.stdout: + return [] + + try: + data = json.loads(result.stdout) + services = [] + + for svc in data.get("items", []): + metadata = svc.get("metadata", {}) + spec = svc.get("spec", {}) + status = svc.get("status", {}) + + # Get external IP from LoadBalancer status + external_ip = "" + lb_ingress = status.get("loadBalancer", {}).get("ingress", []) + if lb_ingress: + external_ip = lb_ingress[0].get( + "ip", lb_ingress[0].get("hostname", "") + ) + + # Format ports + ports = [] + for port in spec.get("ports", []): + port_str = f"{port.get('port')}" + if target := port.get("targetPort"): + port_str += f":{target}" + if proto := port.get("protocol"): + port_str += f"/{proto}" + ports.append(port_str) + + services.append( + ServiceInfo( + name=metadata.get("name", ""), + type=spec.get("type", ""), + cluster_ip=spec.get("clusterIP", ""), + external_ip=external_ip, + ports=",".join(ports), + ) + ) + + return services + except json.JSONDecodeError: + return [] + + # ========================================================================= + # Cert-Manager Operations + # ========================================================================= + + async def check_cert_manager_installed(self) -> bool: + """Check if cert-manager is installed in the cluster.""" + result = await self._run_kubectl( + ["get", "pods", "-n", "cert-manager", "-o", "name"] + ) + return result.success and bool(result.stdout.strip()) + + async def get_cluster_issuer_status( + self, + issuer_name: str, + ) -> ClusterIssuerStatus: + """Get the status of a cert-manager ClusterIssuer.""" + result = await self._run_kubectl( + [ + "get", + "clusterissuer", + issuer_name, + "-o", + "json", + ] + ) + + if not result.success: + return ClusterIssuerStatus( + exists=False, + ready=False, + message="ClusterIssuer not found", + ) + + try: + data = json.loads(result.stdout) + conditions = data.get("status", {}).get("conditions", []) + + ready = False + message = "" + + for condition in conditions: + if condition.get("type") == "Ready": + ready = condition.get("status") == "True" + message = condition.get("message", "") + break + + return ClusterIssuerStatus( + exists=True, + ready=ready, + message=message, + ) + except json.JSONDecodeError: + return ClusterIssuerStatus( + exists=True, + ready=False, + message="Failed to parse ClusterIssuer status", + ) + + async def get_cluster_issuer_yaml(self, issuer_name: str) -> str | None: + """Get the YAML representation of a ClusterIssuer.""" + result = await self._run_kubectl( + ["get", "clusterissuer", issuer_name, "-o", "yaml"] + ) + return result.stdout if result.success else None diff --git a/src/infra/k8s/utils.py b/src/infra/k8s/utils.py new file mode 100644 index 0000000..a7b1960 --- /dev/null +++ b/src/infra/k8s/utils.py @@ -0,0 +1,47 @@ +"""Utility functions for the Kubernetes infrastructure layer. 
+ +Provides helper functions for running async code in sync contexts +and other common utilities. +""" + +from __future__ import annotations + +import asyncio +from collections.abc import Coroutine +from typing import Any + + +def run_sync[T](coro: Coroutine[Any, Any, T]) -> T: + """Run an async coroutine in a blocking sync context. + + This is useful for calling async KubernetesController methods + from synchronous CLI commands. + + Args: + coro: The coroutine to execute + + Returns: + The result of the coroutine + + Example: + from src.infra.k8s import KubectlController, run_sync + + controller = KubectlController() + pods = run_sync(controller.get_pods("my-namespace")) + """ + try: + loop = asyncio.get_running_loop() + except RuntimeError: + # No running loop, create a new one + return asyncio.run(coro) + else: + # We're inside an async context, use run_until_complete + # This handles nested async calls + if loop.is_running(): + # Create a new loop in a thread to avoid blocking + import concurrent.futures + + with concurrent.futures.ThreadPoolExecutor() as pool: + future = pool.submit(asyncio.run, coro) + return future.result() + return loop.run_until_complete(coro) diff --git a/tests/e2e/test_copier_to_deployment.py b/tests/e2e/test_copier_to_deployment.py index 57dda63..4aa87bb 100644 --- a/tests/e2e/test_copier_to_deployment.py +++ b/tests/e2e/test_copier_to_deployment.py @@ -536,9 +536,8 @@ def test_07_docker_compose_prod_deployment(self): "uv", "run", "api-forge-cli", - "deploy", - "down", "prod", + "down", "--volumes", "--yes", ], @@ -638,7 +637,7 @@ def test_07_docker_compose_prod_deployment(self): # Start production deployment try: result = self.run_command( - ["uv", "run", "api-forge-cli", "deploy", "up", "prod"], + ["uv", "run", "api-forge-cli", "prod", "up"], cwd=project_dir, timeout=600, # 10 minutes for building images in CI ) @@ -714,7 +713,7 @@ def test_07_docker_compose_prod_deployment(self): # Check deployment status result = self.run_command( - ["uv", "run", "api-forge-cli", "deploy", "status", "prod"], + ["uv", "run", "api-forge-cli", "prod", "status"], cwd=project_dir, ) @@ -817,9 +816,8 @@ def test_07_docker_compose_prod_deployment(self): "uv", "run", "api-forge-cli", - "deploy", - "down", "prod", + "down", "--volumes", "--yes", ], @@ -935,7 +933,7 @@ def test_08_kubernetes_deployment(self): # Deploy to Kubernetes (with real-time output streaming) print("🚀 Starting K8s deployment with real-time output...") result = self.run_command( - ["uv", "run", "api-forge-cli", "deploy", "up", "k8s"], + ["uv", "run", "api-forge-cli", "k8s", "up"], cwd=project_dir, timeout=600, stream_output=True, diff --git a/tests/unit/cli/deployment/test_validator.py b/tests/unit/cli/deployment/test_validator.py index 6dbf336..c2d2e7c 100644 --- a/tests/unit/cli/deployment/test_validator.py +++ b/tests/unit/cli/deployment/test_validator.py @@ -13,6 +13,7 @@ ValidationResult, ValidationSeverity, ) +from src.infra.k8s.controller import JobInfo, PodInfo class TestValidationResult: @@ -168,7 +169,7 @@ def test_validate_detects_failed_jobs( mock_commands.kubectl.namespace_exists.return_value = True mock_commands.helm.list_releases.return_value = [] mock_commands.kubectl.get_jobs.return_value = [ - {"name": "postgres-verifier", "status": "Failed"}, + JobInfo(name="postgres-verifier", status="Failed"), ] mock_commands.kubectl.get_pods.return_value = [] @@ -188,7 +189,7 @@ def test_validate_any_failed_job_is_warning( mock_commands.kubectl.namespace_exists.return_value = True 
mock_commands.helm.list_releases.return_value = [] mock_commands.kubectl.get_jobs.return_value = [ - {"name": "migration-job", "status": "Failed"}, + JobInfo(name="migration-job", status="Failed"), ] mock_commands.kubectl.get_pods.return_value = [] @@ -208,7 +209,7 @@ def test_validate_detects_crashloop_pods( mock_commands.helm.list_releases.return_value = [] mock_commands.kubectl.get_jobs.return_value = [] mock_commands.kubectl.get_pods.return_value = [ - {"name": "api-forge-app-xyz", "status": "CrashLoopBackOff"}, + PodInfo(name="api-forge-app-xyz", status="CrashLoopBackOff"), ] result = validator.validate("api-forge-prod") @@ -227,7 +228,7 @@ def test_validate_detects_pending_pods( mock_commands.helm.list_releases.return_value = [] mock_commands.kubectl.get_jobs.return_value = [] mock_commands.kubectl.get_pods.return_value = [ - {"name": "api-forge-app-xyz", "status": "Pending"}, + PodInfo(name="api-forge-app-xyz", status="Pending"), ] result = validator.validate("api-forge-prod") @@ -246,7 +247,7 @@ def test_validate_detects_error_pods( mock_commands.helm.list_releases.return_value = [] mock_commands.kubectl.get_jobs.return_value = [] mock_commands.kubectl.get_pods.return_value = [ - {"name": "api-forge-app-xyz", "status": "Error"}, + PodInfo(name="api-forge-app-xyz", status="Error"), ] result = validator.validate("api-forge-prod") @@ -269,19 +270,19 @@ def test_validate_job_pods_only_checks_most_recent( mock_commands.kubectl.get_jobs.return_value = [] mock_commands.kubectl.get_pods.return_value = [ # Old pod from first attempt - failed - { - "name": "postgres-verifier-abc", - "status": "Error", - "jobOwner": "postgres-verifier", - "creationTimestamp": "2025-01-01T10:00:00Z", - }, + PodInfo( + name="postgres-verifier-abc", + status="Error", + job_owner="postgres-verifier", + creation_timestamp="2025-01-01T10:00:00Z", + ), # Newer pod from second attempt - succeeded - { - "name": "postgres-verifier-def", - "status": "Succeeded", - "jobOwner": "postgres-verifier", - "creationTimestamp": "2025-01-01T10:05:00Z", - }, + PodInfo( + name="postgres-verifier-def", + status="Succeeded", + job_owner="postgres-verifier", + creation_timestamp="2025-01-01T10:05:00Z", + ), ] result = validator.validate("api-forge-prod") @@ -299,19 +300,19 @@ def test_validate_job_pods_flags_if_most_recent_failed( mock_commands.kubectl.get_jobs.return_value = [] mock_commands.kubectl.get_pods.return_value = [ # Old pod succeeded - { - "name": "postgres-verifier-abc", - "status": "Succeeded", - "jobOwner": "postgres-verifier", - "creationTimestamp": "2025-01-01T10:00:00Z", - }, + PodInfo( + name="postgres-verifier-abc", + status="Succeeded", + job_owner="postgres-verifier", + creation_timestamp="2025-01-01T10:00:00Z", + ), # Newer pod failed - { - "name": "postgres-verifier-def", - "status": "Error", - "jobOwner": "postgres-verifier", - "creationTimestamp": "2025-01-01T10:05:00Z", - }, + PodInfo( + name="postgres-verifier-def", + status="Error", + job_owner="postgres-verifier", + creation_timestamp="2025-01-01T10:05:00Z", + ), ] result = validator.validate("api-forge-prod") @@ -329,11 +330,11 @@ def test_validate_detects_multiple_issues( mock_commands.kubectl.namespace_exists.return_value = True mock_commands.helm.list_releases.return_value = [] mock_commands.kubectl.get_jobs.return_value = [ - {"name": "postgres-verifier", "status": "Failed"}, + JobInfo(name="postgres-verifier", status="Failed"), ] mock_commands.kubectl.get_pods.return_value = [ - {"name": "api-forge-app-xyz", "status": "CrashLoopBackOff"}, - {"name": 
"api-forge-worker-abc", "status": "Pending"}, + PodInfo(name="api-forge-app-xyz", status="CrashLoopBackOff"), + PodInfo(name="api-forge-worker-abc", status="Pending"), ] result = validator.validate("api-forge-prod") diff --git a/uv.lock b/uv.lock index ddd3a2d..0d2d1de 100644 --- a/uv.lock +++ b/uv.lock @@ -49,6 +49,7 @@ dependencies = [ { name = "fastapi-limiter" }, { name = "httpx" }, { name = "jinja2" }, + { name = "kr8s" }, { name = "loguru" }, { name = "psycopg2-binary" }, { name = "pydantic", extra = ["email"] }, @@ -87,6 +88,7 @@ requires-dist = [ { name = "fastapi-limiter", specifier = ">=0.1.6" }, { name = "httpx", specifier = ">=0.27.2" }, { name = "jinja2", specifier = ">=3.1.0" }, + { name = "kr8s", specifier = ">=0.20.14" }, { name = "loguru", specifier = ">=0.7.3" }, { name = "psycopg2-binary", specifier = ">=2.9.11" }, { name = "pydantic", extras = ["email"], specifier = ">=2.11.9" }, @@ -589,6 +591,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[[package]] +name = "httpx-ws" +version = "0.8.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "httpcore" }, + { name = "httpx" }, + { name = "wsproto" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4a/32/6f7198f55d94063ea84487a31cdd3e149d2702dc0804fc5de06ed12ef2c2/httpx_ws-0.8.2.tar.gz", hash = "sha256:ba0d4aa76e1c8a27bd5e88984ecdcdc28f7bf30b40cb0989a4c1438d07fa52c7", size = 105734, upload-time = "2025-11-07T12:57:36.566Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/29/cd/2008972ddc4c2139b9813d8a097e53dcc74b2a16a85b4069294457954232/httpx_ws-0.8.2-py3-none-any.whl", hash = "sha256:f8898ddb84cbf98c562e8e796675bc68c215fa1d453d54a7fcd935aca8198cc8", size = 15404, upload-time = "2025-11-07T12:57:35.176Z" }, +] + [[package]] name = "identify" version = "2.6.15" @@ -641,6 +658,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/72/b9/313e8f2f2e9517ae050a692ae7b3e4b3f17cc5e6dfea0db51fe14e586580/jinja2_ansible_filters-1.3.2-py3-none-any.whl", hash = "sha256:e1082f5564917649c76fed239117820610516ec10f87735d0338688800a55b34", size = 18975, upload-time = "2022-06-30T14:08:49.571Z" }, ] +[[package]] +name = "kr8s" +version = "0.20.14" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "cachetools" }, + { name = "cryptography" }, + { name = "httpx" }, + { name = "httpx-ws" }, + { name = "packaging" }, + { name = "python-box" }, + { name = "python-jsonpath" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/66/90689a4d960c4fb245d9baf43d9648fe93cd957ddb7f12ce2c12f7ff9700/kr8s-0.20.14.tar.gz", hash = "sha256:e9f859359de0a9c511ee83b119bd1d2a928ee15d59daaf9fc2f11bd37c2bd67b", size = 2838630, upload-time = "2025-12-01T15:37:41.151Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/fc/7d15dd15dc6fada3cae8baa1ae65fe07f9d5ee03d20353a0fef08067cfba/kr8s-0.20.14-py3-none-any.whl", hash = "sha256:33384c0d2e261e95e8f146415dc72f9b255e6632dfe746540790332560184546", size = 86660, upload-time = "2025-12-01T15:37:39.757Z" }, +] + [[package]] name = "loguru" version = "0.7.3" @@ -1039,6 +1076,18 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/ca/31/d4e37e9e550c2b92a9cbc2e4d0b7420a27224968580b5a447f420847c975/pytest_xdist-3.8.0-py3-none-any.whl", hash = "sha256:202ca578cfeb7370784a8c33d6d05bc6e13b4f25b5053c30a152269fd10f0b88", size = 46396, upload-time = "2025-07-01T13:30:56.632Z" }, ] +[[package]] +name = "python-box" +version = "7.3.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/29/f7/635eed8c500adf26208e86e985bbffb6ff039cd8950e3a4749ceca904218/python_box-7.3.2.tar.gz", hash = "sha256:028b9917129e67f311932d93347b8a4f1b500d7a5a2870ee3c035f4e7b19403b", size = 45771, upload-time = "2025-01-16T19:10:05.221Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2f/39/8bec609e93dbc5e0d3ea26cfb5af3ca78915f7a55ef5414713462fedeb59/python_box-7.3.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:1dfc3b9b073f3d7cad1fa90de98eaaa684a494d0574bbc0666f74fa8307fd6b6", size = 1804675, upload-time = "2025-01-16T19:10:23.281Z" }, + { url = "https://files.pythonhosted.org/packages/88/ae/baf3a8057d8129896a7e02619df43ea0d918fc5b2bb66eb6e2470595fbac/python_box-7.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ca4685a7f764b5a71b6e08535ce2a96b7964bb63d8cb4df10f6bb7147b6c54b", size = 4265645, upload-time = "2025-01-16T19:15:34.087Z" }, + { url = "https://files.pythonhosted.org/packages/43/90/72367e03033c11a5e82676ee389b572bf136647ff4e3081557392b37e1ad/python_box-7.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:e143295f74d47a9ab24562ead2375c9be10629599b57f2e86717d3fff60f82a9", size = 1206740, upload-time = "2025-01-16T19:11:30.635Z" }, + { url = "https://files.pythonhosted.org/packages/37/13/8a990c6e2b6cc12700dce16f3cb383324e6d9a30f604eca22a2fdf84c923/python_box-7.3.2-py3-none-any.whl", hash = "sha256:fd7d74d5a848623f93b5221fd9fb00b8c00ff0e130fa87f396277aa188659c92", size = 29479, upload-time = "2025-01-16T19:10:02.749Z" }, +] + [[package]] name = "python-dotenv" version = "1.1.1" @@ -1048,6 +1097,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5f/ed/539768cf28c661b5b068d66d96a2f155c4971a5d55684a514c1a0e0dec2f/python_dotenv-1.1.1-py3-none-any.whl", hash = "sha256:31f23644fe2602f88ff55e1f5c79ba497e01224ee7737937930c448e4d0e24dc", size = 20556, upload-time = "2025-06-24T04:21:06.073Z" }, ] +[[package]] +name = "python-jsonpath" +version = "2.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/db/f1f19205b0df6eb0195de154dc6c967448802dfb573487fa8a4206a243cd/python_jsonpath-2.0.1.tar.gz", hash = "sha256:32a84ebb2dc0ec1b42a6e165b0f9174aef8310bad29154ad9aee31ac37cca18f", size = 49659, upload-time = "2025-09-13T08:01:47.82Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d8/d4/64d7cdc01269f5fed45e6a69f5395c30451958c299ca5cbc1442a4f3f9b9/python_jsonpath-2.0.1-py3-none-any.whl", hash = "sha256:ebd518b7c883acc5b976518d76b6c96288405edec7d9ef838641869c1e1a5eb7", size = 64060, upload-time = "2025-09-13T08:01:46.184Z" }, +] + [[package]] name = "pywin32" version = "311" @@ -1493,3 +1551,15 @@ sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b66 wheels = [ { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" }, ] + +[[package]] +name = "wsproto" 
+version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c7/79/12135bdf8b9c9367b8701c2c19a14c913c120b882d50b014ca0d38083c2c/wsproto-1.3.2.tar.gz", hash = "sha256:b86885dcf294e15204919950f666e06ffc6c7c114ca900b060d6e16293528294", size = 50116, upload-time = "2025-11-20T18:18:01.871Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a4/f5/10b68b7b1544245097b2a1b8238f66f2fc6dcaeb24ba5d917f52bd2eed4f/wsproto-1.3.2-py3-none-any.whl", hash = "sha256:61eea322cdf56e8cc904bd3ad7573359a242ba65688716b0710a5eb12beab584", size = 24405, upload-time = "2025-11-20T18:18:00.454Z" }, +]
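
Usage sketch (illustrative; not part of the diff above). Both controllers implement the
same KubernetesController interface, so callers can swap the kubectl-based and kr8s-based
implementations, and synchronous CLI code can bridge into the async API with run_sync.
The namespace name is an example value, and the Kr8sController import path assumes no
package-level re-export.

    from src.infra.k8s import KubectlController, run_sync
    from src.infra.k8s.kr8s_controller import Kr8sController

    # Either implementation satisfies the KubernetesController interface
    controller = KubectlController()  # or Kr8sController()

    # Bridge from a synchronous CLI command into the async controller API
    pods = run_sync(controller.get_pods("api-forge-prod"))
    for pod in pods:
        print(pod.name, pod.status, pod.restarts)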