diff --git a/.github/headers/LICENSE b/.github/headers/LICENSE
index a3f12d28d..7760ae7c6 100644
--- a/.github/headers/LICENSE
+++ b/.github/headers/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
diff --git a/.gitignore b/.gitignore
index d19ccad4e..0c9fbd23a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,13 @@ code-quality-report.json
go.work
go.work.sum
+# Local tool binaries (managed by api/Makefile)
+api/bin/*
+
+# Server binary output
+bin/
+/device-api-server
+
# ==============================================================================
# IDE & Editor Configurations
# ==============================================================================
@@ -48,3 +55,9 @@ go.work.sum
# Emacs
*~
\#*\#
+
+
+# ==============================================================================
+# Git Worktrees
+# ==============================================================================
+.worktrees/
diff --git a/.versions.yaml b/.versions.yaml
index 122a33f86..15a409121 100644
--- a/.versions.yaml
+++ b/.versions.yaml
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -34,6 +34,6 @@ go_tools:
# Protocol Buffers / gRPC
protobuf:
- protobuf: 'v33.0'
+ protobuf: 'v33.4'
protoc_gen_go: 'v1.36.10'
protoc_gen_go_grpc: 'v1.5.1'
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 9dbdcf56a..e7e16ac6f 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -1,18 +1,130 @@
-# Development Guide
+# NVIDIA Device API: Development Guide
+
+This guide covers the development setup and workflows for contributing to the NVIDIA Device API.
+
+## Module Structure
+
+This repository is a multi-module monorepo containing multiple Go modules:
+
+| Module | Path | Description |
+|--------|------|-------------|
+| `github.com/nvidia/nvsentinel` | `/` | Device API Server implementation |
+| `github.com/nvidia/nvsentinel/api` | `/api` | API definitions (protobuf and Go types) |
+| `github.com/nvidia/nvsentinel/client-go` | `/client-go` | Kubernetes-style gRPC clients |
+| `github.com/nvidia/nvsentinel/code-generator` | `/code-generator` | Code generation tools |
+
+The API module is designed to be imported independently by consumers who only need the type definitions.
+
+## Architecture
+
+This project bridges **gRPC** (for node-local performance) with **Kubernetes API Machinery** (for developer experience).
+
+1. **Definitions**: `api/proto` (Wire format) and `api/device` (Go types).
+2. **Conversion**: `api/device/${version}/converter.go` maps gRPC messages to K8s-style structs.
+3. **Generation**: A pipeline driven by `code-generator/kube_codegen.sh`, which utilizes a modified `client-gen` to produce gRPC-backed Kubernetes clients in the `client-go` module.
+
+---
+
+## Code Generation Pipeline
+
+The NVIDIA Device API uses a multi-stage pipeline to bridge gRPC with Kubernetes API machinery. For module-specific details, see the [client-go Development Guide](./client-go/DEVELOPMENT.md).
+
+```mermaid
+graph TD
+ API["API Definitions<br/>(nvidia/nvsentinel/api)"] -->|Input| CG("client-gen<br/>*Custom Build*")
+
+
+ API -->|Input| LG(lister-gen)
+
+ CG -->|Generates| CLIENT[client/versioned]
+ LG -->|Generates| LISTERS[listers/]
+
+ CLIENT & LISTERS -->|Input| IG(informer-gen)
+ IG -->|Generates| INFORMERS[informers/]
+
+ CLIENT & LISTERS & INFORMERS -->|Final Output| SDK[Ready-to-use SDK]
+```
+
+### Build Sequence
+
+When you run `make code-gen` from the root, the following sequence is executed:
+
+1. **Protoc**: Compiles `.proto` into Go gRPC stubs in `api/gen/`.
+2. **DeepCopy**: Generates `runtime.Object` methods required for K8s compatibility.
+3. **Goverter**: Generates type conversion logic between Protobuf and Go structs.
+4. **Custom client-gen**: Orchestrated by `code-generator/kube_codegen.sh` to produce the versioned Clientset, Informers, and Listers in `client-go/`.
+
+---
+
+## Development Workflow
+
+1. **Modify**: Edit the Protobuf definitions in `api/proto` or Go types in `api/device`.
+2. **Update**: Update the conversion logic in `api/device/${version}/converter.go` to handle changes, if necessary.
+3. **Generate**: Run `make code-gen` from the root. This updates the gRPC stubs, helper methods, and the `client-go` SDK.
+4. **Verify**: Run `make verify-codegen` to ensure the workspace is consistent.
+5. **Test**: Add tests to the affected module and run `make test` from the root.
+
+> **Note:** Use the fake clients in `client-go/client/versioned/fake` for testing controllers without a real gRPC server.
+
+---
+
+## Code Standards & Compliance
+
+### Commit Messages & Signing (DCO)
+
+We follow the [Conventional Commits](https://www.conventionalcommits.org) specification. Additionally, all commits **must** be signed off to comply with the Developer Certificate of Origin (DCO).
+
+```bash
+# Example: feat, fix, docs, chore, refactor
+git commit -s -m "feat: add new GPU condition type"
+```
+
+### License Headers
+
+Every source file (.go, .proto, .sh, Makefile) must include the Apache 2.0 license header.
+
+- **Go/Proto Template**: See `api/hack/boilerplate.go.txt`.
+- **Year**: Ensure the copyright year is current.
---
-## Code Generation
+## Troubleshooting
-This project relies heavily on generated code to ensure consistency with the Kubernetes API machinery.
+### Tooling Not Found
+
+We use `.versions.yaml` to pin tool versions. Our Makefile attempts to use tools from your system path or download them to your Go bin directory.
+
+- **Verify Installation**: `which protoc` or `which yq`.
+- **Fix**: Ensure your `GOPATH/bin` is in your system `$PATH`:
+ ```bash
+ export PATH=$PATH:$(go env GOPATH)/bin
+ ```
+
+### Generated Code Out of Sync
+
+If the build fails or `make verify-codegen` returns an error, your generated artifacts are likely stale.
+
+```bash
+# Clean all generated files across the monorepo
+make clean
+
+# Re-run the full pipeline
+make code-gen
+```
+
+### Dependency Issues
+
+If you see "module not found" or checksum errors:
+
+```bash
+# Tidy all modules
+make tidy
+```
+
+---
-### Generation Pipeline
-The `make code-gen` command orchestrates several tools:
+## Getting Help
-1. **Protoc**: Generates gRPC Go bindings from `api/proto`.
-2. **Goverter**: Generates type-safe conversion logic between internal gRPC types and the Kubernetes-style API types defined in `api/device/`.
-3. **K8s Code-Gen**:
- - Generates `DeepCopy` methods for API types to support standard Kubernetes object manipulation.
- - Generates a versioned, typed **clientset**, along with **listers** and **informers**, providing a native `client-go` experience for consumers.
+- **Issues**: [Create an issue](https://github.com/NVIDIA/device-api/issues/new)
+- **Questions**: [Start a discussion](https://github.com/NVIDIA/device-api/discussions)
+- **Security**: Please refer to [SECURITY](SECURITY.md) for reporting vulnerabilities.
---
diff --git a/Makefile b/Makefile
index 79e7c5567..1dba8bcf7 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -21,14 +21,28 @@
SHELL = /usr/bin/env bash -o pipefail
.SHELLFLAGS = -ec
-VERSION_PKG = github.com/nvidia/nvsentinel/pkg/util/version
-GIT_VERSION := $(shell git describe --tags --always --dirty)
-GIT_COMMIT := $(shell git rev-parse HEAD)
-BUILD_DATE := $(shell date -u +'%Y-%m-%dT%H:%M:%SZ')
-
-LDFLAGS := -X $(VERSION_PKG).GitVersion=$(GIT_VERSION) \
- -X $(VERSION_PKG).GitCommit=$(GIT_COMMIT) \
- -X $(VERSION_PKG).BuildDate=$(BUILD_DATE)
+# Go build settings
+GOOS ?= $(shell go env GOOS)
+GOARCH ?= $(shell go env GOARCH)
+VERSION ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo "dev")
+GIT_COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo "unknown")
+GIT_TREE_STATE ?= $(shell if git diff --quiet 2>/dev/null; then echo "clean"; else echo "dirty"; fi)
+BUILD_DATE ?= $(shell date -u +"%Y-%m-%dT%H:%M:%SZ")
+
+# Version package path for ldflags
+VERSION_PKG = github.com/nvidia/nvsentinel/pkg/version
+
+# Container settings
+CONTAINER_RUNTIME ?= docker
+IMAGE_REGISTRY ?= ghcr.io/nvidia/nvsentinel
+DOCKERFILE := deployments/container/Dockerfile
+
+# Linker flags
+LDFLAGS = -s -w \
+ -X $(VERSION_PKG).Version=$(VERSION) \
+ -X $(VERSION_PKG).GitCommit=$(GIT_COMMIT) \
+ -X $(VERSION_PKG).GitTreeState=$(GIT_TREE_STATE) \
+ -X $(VERSION_PKG).BuildDate=$(BUILD_DATE)
# ==============================================================================
# Targets
@@ -59,34 +73,134 @@ verify-codegen: code-gen ## Verify generated code is up-to-date.
exit 1; \
fi
-.PHONY: tidy
-tidy: ## Run go mod tidy
- go mod tidy
-
-##@ Build & Test
+##@ Build
.PHONY: build
-build: ## Build the device-apiserver binary.
- go build -ldflags "$(LDFLAGS)" -o bin/device-apiserver ./cmd/device-apiserver
+build: build-modules build-server ## Build all modules and server.
+
+.PHONY: build-modules
+build-modules: ## Build all modules.
+ @for mod in $(MODULES); do \
+ if [ -f $$mod/Makefile ]; then \
+ $(MAKE) -C $$mod build; \
+ fi \
+ done
+
+.PHONY: build-server
+build-server: ## Build the Device API Server
+ @echo "Building device-api-server..."
+ @mkdir -p bin
+ CGO_ENABLED=0 GOOS=$(GOOS) GOARCH=$(GOARCH) go build \
+ -ldflags "$(LDFLAGS)" \
+ -o bin/device-api-server \
+ ./cmd/device-api-server
+ @echo "Built bin/device-api-server"
+
+.PHONY: build-nvml-provider
+build-nvml-provider: ## Build the NVML Provider sidecar (requires CGO)
+ @echo "Building nvml-provider..."
+ @mkdir -p bin
+ CGO_ENABLED=1 GOOS=$(GOOS) GOARCH=$(GOARCH) go build \
+ -tags=nvml \
+ -ldflags "$(LDFLAGS)" \
+ -o bin/nvml-provider \
+ ./cmd/nvml-provider
+ @echo "Built bin/nvml-provider"
+
+##@ Testing
.PHONY: test
-test: ## Run unit tests.
- GOTOOLCHAIN=go1.25.5+auto go test -v $$(go list ./... | grep -vE '/pkg/client-go/(client|informers|listers)|/internal/generated/|/test/integration/|/examples/') -cover cover.out
+test: test-modules test-server ## Run tests in all modules.
+
+.PHONY: test-modules
+test-modules: ## Run tests in all modules.
+ @for mod in $(MODULES); do \
+ if [ -f $$mod/Makefile ]; then \
+ $(MAKE) -C $$mod test; \
+ fi \
+ done
+
+.PHONY: test-server
+test-server: ## Run server tests only
+ go test -race -v ./pkg/...
.PHONY: test-integration
-test-integration: ## Run integration tests.
+test-integration: ## Run integration tests
go test -v ./test/integration/...
+##@ Linting
+
.PHONY: lint
-lint: ## Run golangci-lint.
- golangci-lint run ./...
+lint: ## Run linting on all modules.
+ @for mod in $(MODULES); do \
+ if [ -f $$mod/Makefile ]; then \
+ $(MAKE) -C $$mod lint; \
+ fi \
+ done
+ go vet ./...
+
+##@ Container Images
+
+.PHONY: docker-build
+docker-build: docker-build-server docker-build-nvml-provider ## Build all container images
+
+.PHONY: docker-build-server
+docker-build-server: ## Build device-api-server container image
+ $(CONTAINER_RUNTIME) build \
+ --target device-api-server \
+ --build-arg VERSION=$(VERSION) \
+ --build-arg GIT_COMMIT=$(GIT_COMMIT) \
+ --build-arg GIT_TREE_STATE=$(GIT_TREE_STATE) \
+ --build-arg BUILD_DATE=$(BUILD_DATE) \
+ -t $(IMAGE_REGISTRY)/device-api-server:$(VERSION) \
+ -f $(DOCKERFILE) .
+
+.PHONY: docker-build-nvml-provider
+docker-build-nvml-provider: ## Build nvml-provider container image
+ $(CONTAINER_RUNTIME) build \
+ --target nvml-provider \
+ --build-arg VERSION=$(VERSION) \
+ --build-arg GIT_COMMIT=$(GIT_COMMIT) \
+ --build-arg GIT_TREE_STATE=$(GIT_TREE_STATE) \
+ --build-arg BUILD_DATE=$(BUILD_DATE) \
+ -t $(IMAGE_REGISTRY)/nvml-provider:$(VERSION) \
+ -f $(DOCKERFILE) .
+
+.PHONY: docker-push
+docker-push: ## Push all container images
+ $(CONTAINER_RUNTIME) push $(IMAGE_REGISTRY)/device-api-server:$(VERSION)
+ $(CONTAINER_RUNTIME) push $(IMAGE_REGISTRY)/nvml-provider:$(VERSION)
+
+##@ Helm
+
+.PHONY: helm-lint
+helm-lint: ## Lint Helm chart
+ helm lint deployments/helm/device-api-server
+
+.PHONY: helm-template
+helm-template: ## Render Helm chart templates
+ helm template device-api-server deployments/helm/device-api-server
+
+.PHONY: helm-package
+helm-package: ## Package Helm chart
+ @mkdir -p dist/
+ helm package deployments/helm/device-api-server -d dist/
+
+##@ Cleanup
.PHONY: clean
-clean: ## Remove generated artifacts.
- @echo "Cleaning generated artifacts..."
+clean: ## Clean generated artifacts in all modules.
+ @for mod in $(MODULES); do \
+ if [ -f $$mod/Makefile ]; then \
+ $(MAKE) -C $$mod clean; \
+ fi \
+ done
rm -rf bin/
- rm -rf internal/generated/
- rm -rf pkg/client-go/client/ pkg/client-go/informers/ pkg/client-go/listers/
- find api/ -name "zz_generated.deepcopy.go" -delete
- find api/ -name "zz_generated.goverter.go" -delete
- rm -f cover.out
+
+.PHONY: tidy
+tidy: ## Run go mod tidy on all modules.
+ @for mod in $(MODULES); do \
+ echo "Tidying $$mod..."; \
+ (cd $$mod && go mod tidy); \
+ done
+ go mod tidy
diff --git a/README.md b/README.md
index b7bbfc818..fcaf95767 100644
--- a/README.md
+++ b/README.md
@@ -1,56 +1,169 @@
# NVIDIA Device API
-**The NVIDIA Device API allows you to query and manipulate the state of node-local resources (such as GPUs) in Kubernetes**. Unlike the cluster-wide Kubernetes API, the Device API operates exclusively at the node level.
+The NVIDIA Device API provides a Kubernetes-idiomatic Go SDK and Protobuf definitions for interacting with NVIDIA device resources.
-The core control plane is the Device API server and the gRPC API that it exposes. Node-level agents, local monitoring tools, and external components communicate with one another through this node-local Device API server rather than the central Kubernetes control plane.
+**Node-local GPU device state management for Kubernetes**
-NVIDIA provides a [client library](./pkg/client-go) for those looking to write applications using the Device API. This library allows you to query and manipulate node-local resources using standard Kubernetes interfaces. Alternatively, the API can be accessed directly via gRPC.
+The NVIDIA Device API provides a standardized gRPC interface for observing and managing GPU device states in Kubernetes environments. It enables coordination between:
+
+- **Providers** (health monitors like NVSentinel, DCGM) that detect GPU health issues
+- **Consumers** (device plugins, DRA drivers) that need GPU health status for scheduling
+
+## Overview
+
+The Device API Server is a pure Go gRPC server with no hardware dependencies.
+GPU enumeration and health monitoring are provided by external providers (sidecars).
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│ GPU Node │
+│ │
+│ ┌─────────────────────────────────────────────────────────┐│
+│ │ Device API Server (DaemonSet) ││
+│ │ ││
+│ │ ┌─────────────────────────────────────────────────┐ ││
+│ │ │ GpuService (unified) │ ││
+│ │ │ Read: GetGpu, ListGpus, WatchGpus │ ││
+│ │ │ Write: CreateGpu, UpdateGpuStatus, DeleteGpu │ ││
+│ │ └────────────────────┬────────────────────────────┘ ││
+│ │ ▼ ││
+│ │ ┌──────────────────────────────────────────────────┐ ││
+│ │ │ GPU Cache (RWMutex) │ ││
+│ │ └──────────────────────────────────────────────────┘ ││
+│ └─────────────────────────────────────────────────────────┘│
+│ │
+│ Providers (gRPC clients): │
+│ ├── nvml-provider sidecar ─► CreateGpu, UpdateGpuStatus │
+│ ├── NVSentinel ────────────► CreateGpu, UpdateGpuStatus │
+│ └── Custom providers ──────► CreateGpu, UpdateGpuStatus │
+│ │
+│ Consumers (gRPC clients): │
+│ ├── Device Plugins ────────► GetGpu, ListGpus, WatchGpus │
+│ └── DRA Drivers ───────────► GetGpu, ListGpus, WatchGpus │
+└─────────────────────────────────────────────────────────────┘
+```
+
+## Key Features
+
+- **Pure Go server**: No hardware dependencies; providers run as separate sidecars
+- **Read-blocking semantics**: Consumer reads block during provider updates to prevent stale data
+- **Multiple provider support**: Aggregate health status from NVSentinel, DCGM, or custom providers
+- **Watch streams**: Real-time GPU state change notifications
+- **Prometheus metrics**: Full observability with alerting rules
+- **Helm chart**: Production-ready Kubernetes deployment
+
+## Repository Structure
+
+| Module | Description |
+| :--- | :--- |
+| [`api/`](./api) | Protobuf definitions and Go types for the Device API. |
+| [`client-go/`](./client-go) | Kubernetes-style generated clients, informers, and listers. |
+| [`code-generator/`](./code-generator) | Tools for generating NVIDIA-specific client logic. |
+| [`cmd/device-api-server/`](./cmd/device-api-server) | Device API Server binary |
+| [`pkg/deviceapiserver/`](./pkg/deviceapiserver) | Server implementation |
+| [`deployments/helm/`](./deployments/helm) | Helm chart for Kubernetes deployment |
---
## Quick Start
+### Deploy Device API Server
+
+```bash
+# Install with Helm
+helm install device-api-server ./deployments/helm/device-api-server \
+ --namespace device-api --create-namespace
+```
+
+For GPU enumeration and health monitoring, deploy the nvml-provider sidecar.
+See the [nvml-sidecar demo](demos/nvml-sidecar-demo.sh) for an example deployment.
+
+### Using the Go Client
+
+```bash
+go get github.com/nvidia/nvsentinel/api@latest
+```
+
```go
import (
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "github.com/nvidia/nvsentinel/pkg/client-go/clientset/versioned"
- "github.com/nvidia/nvsentinel/pkg/grpc/client"
+ v1alpha1 "github.com/nvidia/nvsentinel/api/gen/go/device/v1alpha1"
+)
+```
+
+### Example: List GPUs
+
+```go
+package main
+
+import (
+ "context"
+ "log"
+
+ v1alpha1 "github.com/nvidia/nvsentinel/api/gen/go/device/v1alpha1"
+ "google.golang.org/grpc"
+ "google.golang.org/grpc/credentials/insecure"
)
func main() {
- ctx := context.Background()
+ // Connect via Unix socket (recommended for node-local access)
+ conn, err := grpc.NewClient(
+ "unix:///var/run/device-api/device.sock",
+ grpc.WithTransportCredentials(insecure.NewCredentials()),
+ )
+ if err != nil {
+ log.Fatalf("failed to connect: %v", err)
+ }
+ defer conn.Close()
- // Connect to the local node's Device API server
- config := &client.Config{Target: "unix:///var/run/nvidia-device-api/device-api.sock"}
- clientset := versioned.NewForConfigOrDie(config)
+ client := v1alpha1.NewGpuServiceClient(conn)
- // Standard Kubernetes-style List call
- gpus, err := clientset.DeviceV1alpha1().GPUs().List(ctx, metav1.ListOptions{})
+ // List all GPUs
+ resp, err := client.ListGpus(context.Background(), &v1alpha1.ListGpusRequest{})
if err != nil {
- panic(err)
+ log.Fatalf("failed to list GPUs: %v", err)
+ }
+
+ for _, gpu := range resp.GpuList.Items {
+ log.Printf("GPU: %s (UUID: %s)", gpu.Name, gpu.Spec.Uuid)
+ for _, cond := range gpu.Status.Conditions {
+ log.Printf(" %s: %s (%s)", cond.Type, cond.Status, cond.Reason)
+ }
}
}
```
-See [examples](./examples) for additional details.
+### Using grpcurl
----
+```bash
+# List GPUs
+grpcurl -plaintext localhost:50051 nvidia.device.v1alpha1.GpuService/ListGpus
+
+# Watch for changes
+grpcurl -plaintext localhost:50051 nvidia.device.v1alpha1.GpuService/WatchGpus
+```
-## Components
+## API Overview
-### Device API Server
-The `device-apiserver` is a node-local control plane for NVIDIA devices.
+### GpuService
-**Running the server**:
-```bash
-# Build the binary
-make build
+The unified `GpuService` follows Kubernetes API conventions with standard CRUD methods:
-# Start the server with a local database
-./bin/device-apiserver \
- --bind-address="unix:///var/run/nvidia-device-api/device-api.sock" \
- --datastore-endpoint="sqlite:///var/lib/nvidia-device-api/state.db"
-```
+**Read Operations** (for consumers like device plugins and DRA drivers):
+
+| Method | Description |
+|--------|-------------|
+| `GetGpu` | Retrieves a single GPU resource by its unique name |
+| `ListGpus` | Retrieves a list of all GPU resources |
+| `WatchGpus` | Streams lifecycle events (ADDED, MODIFIED, DELETED) for GPU resources |
+
+**Write Operations** (for providers like health monitors):
+
+| Method | Description |
+|--------|-------------|
+| `CreateGpu` | Register a new GPU with the server |
+| `UpdateGpu` | Replace entire GPU resource |
+| `UpdateGpuStatus` | Update GPU status only (acquires write lock) |
+| `DeleteGpu` | Remove a GPU from the server |
---
@@ -58,29 +171,60 @@ make build
### Prerequisites
-* **Go**: `v1.25+`
-* **Protoc**: Required for protobuf generation.
-* **Make**
+- **Go**: `v1.25+`
+- **Protoc**: Required for protobuf generation
+- **golangci-lint**: Required for code quality checks
+- **Make**: Used for orchestrating build and generation tasks
+- **Helm 3.0+**: For chart development
-### Workflow
-The project utilizes a unified generation pipeline. **Avoid editing generated files directly**. If Protobuf definitions (`.proto`) or Go types (`_types.go`) are modified, run the following commands to synchronize the repository:
+### Build
```bash
-# Sync all gRPC bindings, DeepCopy/Conversion methods, Clients, and Server
+# Build everything
+make build
+
+# Build server only
+make build-server
+
+# Generate protobuf code
make code-gen
+```
-# Run tests
+### Test
+
+```bash
+# Run all tests
make test
-# Verify code quality
-make lint
+# Run server tests only
+make test-server
+```
-# Optional: Run integration tests
-make test-integration
+### Lint
+
+```bash
+make lint
```
---
+## Documentation
+
+- **[API Reference](docs/api/device-api-server.md)** - Complete gRPC API documentation
+- **[Operations Guide](docs/operations/device-api-server.md)** - Deployment, configuration, monitoring
+- **[Helm Chart](deployments/helm/device-api-server/README.md)** - Chart configuration reference
+- **[Design Documents](docs/design/)** - Architecture and design decisions
+
+The `client-go` module includes several examples showing how to use the generated clients:
+
+* **Standard Client**: Basic CRUD operations.
+* **Shared Informers**: High-performance caching for controllers.
+* **Watch**: Real-time event streaming via gRPC.
+
+See the [examples](./client-go/examples) directory for details.
+
+---
+
## Contributing
We welcome contributions! Please see:
diff --git a/api/device/v1alpha1/converter.go b/api/device/v1alpha1/converter.go
index ff649f992..14b11b5e0 100644
--- a/api/device/v1alpha1/converter.go
+++ b/api/device/v1alpha1/converter.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -54,6 +54,17 @@ type Converter interface {
// FromProtobufObjectMeta converts a protobuf ObjectMeta into a metav1.ObjectMeta object.
//
+ // The following fields are intentionally excluded from the proto API:
+ // - DeletionTimestamp/GracePeriodSeconds: Managed by server-side deletion logic
+ // - Labels/Annotations: Not needed for device-level proto API; K8s controllers
+ // should use the native K8s API for label/annotation management
+ // - OwnerReferences/Finalizers: Not exposed in proto to prevent external
+ // controllers from creating dependency chains via the device API
+ // - ManagedFields/SelfLink: Server-managed metadata, not user-facing
+ //
+ // If labels/annotations support is needed in the future, add them to the
+ // proto ObjectMeta definition and remove the goverter:ignore directives.
+ //
// goverter:map Uid UID
// goverter:ignore GenerateName DeletionTimestamp DeletionGracePeriodSeconds
// goverter:ignore Labels Annotations OwnerReferences Finalizers ManagedFields SelfLink
diff --git a/api/device/v1alpha1/gpu_types.go b/api/device/v1alpha1/gpu_types.go
index e551b85a9..704bea40e 100644
--- a/api/device/v1alpha1/gpu_types.go
+++ b/api/device/v1alpha1/gpu_types.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -48,8 +48,7 @@ type GPUStatus struct {
//
// +genclient
// +genclient:nonNamespaced
-// +genclient:onlyVerbs=get,list,watch,create,update,delete
-// +genclient:noStatus
+// +genclient:onlyVerbs=get,list,watch,create,update,updateStatus,delete
// +k8s:deepcopy-gen=true
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
type GPU struct {
diff --git a/api/device/v1alpha1/zz_generated.deepcopy.go b/api/device/v1alpha1/zz_generated.deepcopy.go
index 0c399eb3e..f5cf44cb4 100644
--- a/api/device/v1alpha1/zz_generated.deepcopy.go
+++ b/api/device/v1alpha1/zz_generated.deepcopy.go
@@ -1,7 +1,7 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/api/proto/device/v1alpha1/gpu.proto b/api/proto/device/v1alpha1/gpu.proto
index 2641c415e..88577a9c6 100644
--- a/api/proto/device/v1alpha1/gpu.proto
+++ b/api/proto/device/v1alpha1/gpu.proto
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -194,6 +194,9 @@ service GpuService {
// UpdateGpu updates a single GPU resource.
rpc UpdateGpu(UpdateGpuRequest) returns (Gpu);
+ // UpdateGpuStatus updates only the status subresource of a GPU.
+ rpc UpdateGpuStatus(UpdateGpuStatusRequest) returns (Gpu);
+
// DeleteGpu deletes a single GPU resource.
rpc DeleteGpu(DeleteGpuRequest) returns (google.protobuf.Empty);
}
@@ -289,6 +292,18 @@ message UpdateGpuRequest {
UpdateOptions opts = 2;
}
+// UpdateGpuStatusRequest specifies the GPU whose status should be updated.
+// Only metadata (name, namespace, resource_version) and status fields are used.
+message UpdateGpuStatusRequest {
+ // gpu is the GPU resource with updated status.
+ // The server reads metadata.name, metadata.namespace, metadata.resource_version
+ // and status from this object. All other fields are ignored.
+ Gpu gpu = 1;
+
+ // opts contains the options for the update.
+ UpdateOptions opts = 2;
+}
+
message DeleteGpuRequest {
// The unique resource name of the GPU to delete.
string name = 1;
diff --git a/cmd/device-api-server/main.go b/cmd/device-api-server/main.go
new file mode 100644
index 000000000..91f61b039
--- /dev/null
+++ b/cmd/device-api-server/main.go
@@ -0,0 +1,186 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package main implements the Device API Server.
+//
+// The Device API Server is a node-local gRPC cache server deployed as a
+// Kubernetes DaemonSet. It acts as an intermediary between providers
+// (health monitors) that update GPU device states and consumers
+// (device plugins, DRA drivers) that read device states.
+//
+// Key features:
+// - Read-blocking semantics: Reads are blocked during provider updates
+// to prevent consumers from reading stale data
+// - Multiple provider support: Multiple health monitors can update
+// different conditions on the same GPUs
+// - Multiple consumer support: Device plugins, DRA drivers, and other
+// consumers can read and watch GPU states
+// - Observability: Prometheus metrics, structured logging with klog/v2
+package main
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "os"
+ "os/signal"
+ "syscall"
+
+ "github.com/spf13/pflag"
+ "golang.org/x/sync/errgroup"
+ cliflag "k8s.io/component-base/cli/flag"
+ "k8s.io/klog/v2"
+
+ "github.com/nvidia/nvsentinel/pkg/controlplane/apiserver"
+ "github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/options"
+ "github.com/nvidia/nvsentinel/pkg/storage/storagebackend"
+ "github.com/nvidia/nvsentinel/pkg/version"
+
+ // Import service providers so their init() functions register them.
+ _ "github.com/nvidia/nvsentinel/pkg/services/device/v1alpha1"
+)
+
+const (
+ // ComponentName is the name of this component for logging.
+ ComponentName = "device-api-server"
+)
+
+func main() {
+ opts := options.NewOptions()
+
+ fss := cliflag.NamedFlagSets{}
+ opts.AddFlags(&fss)
+
+ // Add a version flag to the global flag set.
+ showVersion := pflag.Bool("version", false, "Show version and exit")
+
+ // Merge all named flag sets into the global pflag command line.
+ for _, fs := range fss.FlagSets {
+ pflag.CommandLine.AddFlagSet(fs)
+ }
+
+ pflag.Parse()
+
+ // Handle version flag before any other initialization.
+ if *showVersion {
+ v := version.Get()
+ enc := json.NewEncoder(os.Stdout)
+ enc.SetIndent("", " ")
+ if err := enc.Encode(v); err != nil {
+ fmt.Fprintf(os.Stderr, "Failed to encode version: %v\n", err)
+ os.Exit(1)
+ }
+ os.Exit(0)
+ }
+
+ // Set up signal handling for graceful shutdown.
+ ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
+ defer cancel()
+
+ // Complete fills in defaults and resolves environment overrides.
+ completedOpts, err := opts.Complete(ctx)
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "Failed to complete options: %v\n", err)
+ os.Exit(1)
+ }
+
+ // Validate rejects invalid flag combinations.
+ if errs := completedOpts.Validate(); len(errs) > 0 {
+ for _, e := range errs {
+ fmt.Fprintf(os.Stderr, "Invalid configuration: %v\n", e)
+ }
+ os.Exit(1)
+ }
+
+ // Create root logger with component name.
+ logger := klog.Background().WithName(ComponentName)
+ ctx = klog.NewContext(ctx, logger)
+
+ versionInfo := version.Get()
+ logger.Info("Starting server",
+ "version", versionInfo.Version,
+ "commit", versionInfo.GitCommit,
+ "buildDate", versionInfo.BuildDate,
+ )
+
+ // Build the apiserver configuration from completed options.
+ apiserverConfig, err := apiserver.NewConfig(ctx, completedOpts)
+ if err != nil {
+ logger.Error(err, "Failed to create apiserver config")
+ os.Exit(1)
+ }
+
+ completedAPIServerConfig, err := apiserverConfig.Complete()
+ if err != nil {
+ logger.Error(err, "Failed to complete apiserver config")
+ os.Exit(1)
+ }
+
+ // Build the storage backend configuration from completed options.
+ storageConfig, err := storagebackend.NewConfig(ctx, completedOpts.Storage)
+ if err != nil {
+ logger.Error(err, "Failed to create storage config")
+ os.Exit(1)
+ }
+
+ completedStorageConfig, err := storageConfig.Complete()
+ if err != nil {
+ logger.Error(err, "Failed to complete storage config")
+ os.Exit(1)
+ }
+
+ storage, err := completedStorageConfig.New()
+ if err != nil {
+ logger.Error(err, "Failed to create storage backend")
+ os.Exit(1)
+ }
+
+ preparedStorage, err := storage.PrepareRun(ctx)
+ if err != nil {
+ logger.Error(err, "Failed to prepare storage backend")
+ os.Exit(1)
+ }
+
+ // Create, prepare the device API server before starting the run loop.
+ server, err := completedAPIServerConfig.New(storage)
+ if err != nil {
+ logger.Error(err, "Failed to create device API server")
+ os.Exit(1)
+ }
+
+ prepared, err := server.PrepareRun(ctx)
+ if err != nil {
+ logger.Error(err, "Failed to prepare device API server")
+ os.Exit(1)
+ }
+
+ // Run storage and server concurrently. If either fails, the errgroup
+ // cancels the shared context so the other component shuts down.
+ g, gctx := errgroup.WithContext(ctx)
+
+ g.Go(func() error {
+ return preparedStorage.Run(gctx)
+ })
+
+ g.Go(func() error {
+ return prepared.Run(gctx)
+ })
+
+ if err := g.Wait(); err != nil {
+ logger.Error(err, "Server error")
+ os.Exit(1)
+ }
+
+ logger.Info("Server stopped gracefully")
+}
diff --git a/cmd/device-apiserver/apiserver.go b/cmd/device-apiserver/apiserver.go
deleted file mode 100644
index 3d2f8352a..000000000
--- a/cmd/device-apiserver/apiserver.go
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-import (
- "os"
-
- "k8s.io/component-base/cli"
-
- "github.com/nvidia/nvsentinel/cmd/device-apiserver/app"
-)
-
-func main() {
- command := app.NewAPIServerCommand()
- code := cli.Run(command)
- os.Exit(code)
-}
diff --git a/cmd/device-apiserver/app/config.go b/cmd/device-apiserver/app/config.go
deleted file mode 100644
index 520b4c0c2..000000000
--- a/cmd/device-apiserver/app/config.go
+++ /dev/null
@@ -1,86 +0,0 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package app
-
-import (
- "context"
-
- "github.com/nvidia/nvsentinel/cmd/device-apiserver/app/options"
- controlplane "github.com/nvidia/nvsentinel/pkg/controlplane/apiserver"
- "github.com/nvidia/nvsentinel/pkg/storage/storagebackend"
-)
-
-type Config struct {
- Options options.CompletedOptions
-
- Storage *storagebackend.Config
- APIs *controlplane.Config
-}
-
-type completedConfig struct {
- Options options.CompletedOptions
-
- Storage storagebackend.CompletedConfig
- APIs controlplane.CompletedConfig
-}
-
-type CompletedConfig struct {
- *completedConfig
-}
-
-func NewConfig(ctx context.Context, opts options.CompletedOptions) (*Config, error) {
- c := &Config{
- Options: opts,
- }
-
- storageConfig, err := storagebackend.NewConfig(ctx, opts.Storage)
- if err != nil {
- return nil, err
- }
-
- c.Storage = storageConfig
-
- controlPlaneConfig, err := controlplane.NewConfig(ctx, opts.CompletedOptions)
- if err != nil {
- return nil, err
- }
-
- c.APIs = controlPlaneConfig
-
- return c, nil
-}
-
-func (c *Config) Complete() (CompletedConfig, error) {
- if c == nil || c.Storage == nil || c.APIs == nil {
- return CompletedConfig{}, nil
- }
-
- completedStorage, err := c.Storage.Complete()
- if err != nil {
- return CompletedConfig{}, err
- }
-
- completedAPIs, err := c.APIs.Complete()
- if err != nil {
- return CompletedConfig{}, err
- }
-
- return CompletedConfig{&completedConfig{
- Options: c.Options,
-
- Storage: completedStorage,
- APIs: completedAPIs,
- }}, nil
-}
diff --git a/cmd/device-apiserver/app/config_test.go b/cmd/device-apiserver/app/config_test.go
deleted file mode 100644
index a02d0ec64..000000000
--- a/cmd/device-apiserver/app/config_test.go
+++ /dev/null
@@ -1,76 +0,0 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package app
-
-import (
- "context"
- "testing"
-
- "github.com/nvidia/nvsentinel/cmd/device-apiserver/app/options"
-)
-
-func TestConfig(t *testing.T) {
- ctx, cancel := context.WithCancel(context.Background())
- defer cancel()
-
- opts := options.NewServerRunOptions()
-
- completedOpts, err := opts.Complete(ctx)
- if err != nil {
- t.Fatalf("Failed to complete options: %v", err)
- }
-
- cfg, err := NewConfig(ctx, completedOpts)
- if err != nil {
- t.Fatalf("NewConfig failed: %v", err)
- }
-
- if cfg.Storage == nil {
- t.Error("NewConfig did not initialize Storage config")
- }
- if cfg.APIs == nil {
- t.Error("NewConfig did not initialize APIs config")
- }
-
- t.Run("Complete", func(t *testing.T) {
- completedCfg, err := cfg.Complete()
- if err != nil {
- t.Fatalf("Complete failed: %v", err)
- }
-
- if completedCfg.completedConfig == nil {
- t.Fatal("CompletedConfig internal pointer is nil")
- }
-
- validationErrors := completedCfg.Options.Validate()
- if len(validationErrors) > 0 {
- t.Errorf("CompletedConfig is invalid: %v", validationErrors)
- }
- })
-
- t.Run("NilSafety", func(t *testing.T) {
- var nilCfg *Config
- _, err := nilCfg.Complete()
- if err != nil {
- t.Errorf("Complete() on nil config should not return error, got: %v", err)
- }
-
- partialCfg := &Config{}
- _, err = partialCfg.Complete()
- if err != nil {
- t.Errorf("Complete() on empty config should handle nil sub-fields gracefully, got: %v", err)
- }
- })
-}
diff --git a/cmd/device-apiserver/app/main_test.go b/cmd/device-apiserver/app/main_test.go
deleted file mode 100644
index b1f6de7de..000000000
--- a/cmd/device-apiserver/app/main_test.go
+++ /dev/null
@@ -1,11 +0,0 @@
-package app
-
-import (
- "testing"
-
- "github.com/nvidia/nvsentinel/pkg/util/testutils"
-)
-
-func TestMain(m *testing.M) {
- testutils.VerifyTestMain(m)
-}
diff --git a/cmd/device-apiserver/app/options/options.go b/cmd/device-apiserver/app/options/options.go
deleted file mode 100644
index 498edc89f..000000000
--- a/cmd/device-apiserver/app/options/options.go
+++ /dev/null
@@ -1,76 +0,0 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package options
-
-import (
- "context"
-
- cp "github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/options"
- cliflag "k8s.io/component-base/cli/flag"
-)
-
-type ServerRunOptions struct {
- *cp.Options
-}
-
-type completedOptions struct {
- cp.CompletedOptions
-}
-
-type CompletedOptions struct {
- *completedOptions
-}
-
-func NewServerRunOptions() *ServerRunOptions {
- return &ServerRunOptions{
- Options: cp.NewOptions(),
- }
-}
-
-func (s *ServerRunOptions) Flags() cliflag.NamedFlagSets {
- fss := cliflag.NamedFlagSets{}
- if s == nil || s.Options == nil {
- return fss
- }
-
- s.AddFlags(&fss)
-
- return fss
-}
-
-func (o *ServerRunOptions) Complete(ctx context.Context) (CompletedOptions, error) {
- if o == nil {
- return CompletedOptions{completedOptions: &completedOptions{}}, nil
- }
-
- controlplane, err := o.Options.Complete(ctx)
- if err != nil {
- return CompletedOptions{}, err
- }
-
- completed := completedOptions{
- CompletedOptions: controlplane,
- }
-
- return CompletedOptions{
- completedOptions: &completed,
- }, nil
-}
-
-func (o completedOptions) Validate() []error {
- errs := o.CompletedOptions.Validate()
-
- return errs
-}
diff --git a/cmd/device-apiserver/app/options/options_test.go b/cmd/device-apiserver/app/options/options_test.go
deleted file mode 100644
index b81e5ac95..000000000
--- a/cmd/device-apiserver/app/options/options_test.go
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package options
-
-import (
- "context"
- "testing"
-)
-
-func TestServerRunOptions(t *testing.T) {
- opts := NewServerRunOptions()
- if opts == nil || opts.Options == nil {
- t.Fatal("NewServerRunOptions failed to initialize internal options")
- }
-
- fss := opts.Flags()
- if len(fss.FlagSets) == 0 {
- t.Error("Flags() returned empty NamedFlagSets; expected flags from internal options")
- }
-
- var nilOpts *ServerRunOptions
- nilFss := nilOpts.Flags()
- if len(nilFss.FlagSets) != 0 {
- t.Error("Flags() on nil options should return empty flag sets")
- }
-
- t.Run("CompleteAndValidate", func(t *testing.T) {
- ctx := context.Background()
-
- completed, err := opts.Complete(ctx)
- if err != nil {
- t.Fatalf("Complete failed: %v", err)
- }
-
- if completed.completedOptions == nil {
- t.Fatal("CompletedOptions internal pointer is nil")
- }
-
- errs := completed.Validate()
- if len(errs) > 0 {
- t.Logf("Note: Default validation returned %d errors (this is expected if defaults require setup)", len(errs))
- }
- })
-
- t.Run("CompleteNil", func(t *testing.T) {
- var nilOpts *ServerRunOptions
- completed, err := nilOpts.Complete(context.Background())
- if err != nil {
- t.Errorf("Complete() on nil options should not return error, got: %v", err)
- }
- if completed.completedOptions == nil {
- t.Error("Complete() on nil options should return a valid wrapper")
- }
- })
-}
diff --git a/cmd/device-apiserver/app/server.go b/cmd/device-apiserver/app/server.go
deleted file mode 100644
index be9165554..000000000
--- a/cmd/device-apiserver/app/server.go
+++ /dev/null
@@ -1,157 +0,0 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package app
-
-import (
- "context"
- "os"
-
- "github.com/nvidia/nvsentinel/cmd/device-apiserver/app/options"
- _ "github.com/nvidia/nvsentinel/pkg/services/device/v1alpha1"
- "github.com/nvidia/nvsentinel/pkg/util/verflag"
- utilversion "github.com/nvidia/nvsentinel/pkg/util/version"
- "github.com/spf13/cobra"
- "golang.org/x/sync/errgroup"
- utilerrors "k8s.io/apimachinery/pkg/util/errors"
- genericapiserver "k8s.io/apiserver/pkg/server"
- cliflag "k8s.io/component-base/cli/flag"
- "k8s.io/component-base/cli/globalflag"
- "k8s.io/component-base/logs"
- logsapi "k8s.io/component-base/logs/api/v1"
- "k8s.io/component-base/term"
- "k8s.io/klog/v2"
-)
-
-// NewAPIServerCommand creates a *cobra.Command object with default parameters
-func NewAPIServerCommand() *cobra.Command {
- s := options.NewServerRunOptions()
- ctx := genericapiserver.SetupSignalContext()
-
- cmd := &cobra.Command{
- Use: "device-apiserver",
- Long: `The Device API server validates and configures data
-for the api objects which include gpus and others. The API Server services
-gRPC operations and provides the frontend to a node's shared state through
-which all other node-local components interact.`,
-
- RunE: func(cmd *cobra.Command, args []string) error {
- verflag.PrintAndExitIfRequested()
-
- fs := cmd.Flags()
- // Activate logging as soon as possible, after that
- // show flags with the final logging configuration.
- logsapi.ReapplyHandling = logsapi.ReapplyHandlingIgnoreUnchanged
- if err := logsapi.ValidateAndApply(s.Logs, nil); err != nil {
- return err
- }
-
- cliflag.PrintFlags(fs)
-
- // set default options
- completedOptions, err := s.Complete(ctx)
- if err != nil {
- return err
- }
-
- // validate options
- if errs := completedOptions.Validate(); len(errs) != 0 {
- return utilerrors.NewAggregate(errs)
- }
-
- return Run(ctx, completedOptions)
- },
- Args: cobra.NoArgs,
- }
- cmd.SetContext(ctx)
-
- fs := cmd.Flags()
- namedFlagSets := s.Flags()
- verflag.AddFlags(namedFlagSets.FlagSet("global"))
- globalflag.AddGlobalFlags(namedFlagSets.FlagSet("global"), cmd.Name(), logs.SkipLoggingConfigurationFlags())
-
- for _, f := range namedFlagSets.FlagSets {
- fs.AddFlagSet(f)
- }
-
- cols, _, _ := term.TerminalSize(cmd.OutOrStdout())
- cliflag.SetUsageAndHelpFunc(cmd, namedFlagSets, cols)
-
- return cmd
-}
-
-// Run runs the specified APIServer. This should never exit.
-func Run(ctx context.Context, opts options.CompletedOptions) error {
- logger := klog.FromContext(ctx).WithValues("node", opts.NodeName)
- ctx = klog.NewContext(ctx, logger)
-
- logger.Info("Initializing Device API Server", "version", utilversion.Get())
- logger.V(2).Info("Golang settings",
- "GOGC", os.Getenv("GOGC"),
- "GOMAXPROCS", os.Getenv("GOMAXPROCS"),
- "GOTRACEBACK", os.Getenv("GOTRACEBACK"),
- )
-
- config, err := NewConfig(ctx, opts)
- if err != nil {
- return err
- }
-
- completed, err := config.Complete()
- if err != nil {
- return err
- }
-
- // Initialize and prepare storage to be injected into the server for readiness.
- storage, err := completed.Storage.New()
- if err != nil {
- return err
- }
-
- // Inject storage into the server to coordinate startup.
- server, err := completed.APIs.New(storage)
- if err != nil {
- return err
- }
-
- g, ctx := errgroup.WithContext(ctx)
-
- g.Go(func() error {
- preparedStorage, err := storage.PrepareRun(ctx)
- if err != nil {
- return err
- }
-
- return preparedStorage.Run(ctx)
- })
-
- g.Go(func() error {
- preparedServer, err := server.PrepareRun(ctx)
- if err != nil {
- return err
- }
-
- return preparedServer.Run(ctx)
- })
-
- err = g.Wait()
- if err != nil {
- logger.Error(err, "internal error: Device API Server exited with error")
- return err
- }
-
- logger.Info("Device API Server shut down gracefully")
-
- return nil
-}
diff --git a/cmd/device-apiserver/app/server_test.go b/cmd/device-apiserver/app/server_test.go
deleted file mode 100644
index a81dac2da..000000000
--- a/cmd/device-apiserver/app/server_test.go
+++ /dev/null
@@ -1,117 +0,0 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package app
-
-import (
- "context"
- "fmt"
- "os"
- "path/filepath"
- "strings"
- "testing"
- "time"
-
- "github.com/nvidia/nvsentinel/cmd/device-apiserver/app/options"
- "github.com/nvidia/nvsentinel/pkg/util/testutils"
-)
-
-func TestRun(t *testing.T) {
- opts := options.NewServerRunOptions()
-
- localSocket := testutils.NewUnixAddr(t)
- kineSocket := fmt.Sprintf("unix://%s", testutils.NewUnixAddr(t))
- healthAddr := testutils.GetFreeTCPAddress(t)
-
- opts.GRPC.BindAddress = "unix://" + localSocket
- opts.HealthAddress = healthAddr
- opts.NodeName = "test-node"
-
- tmpDir := t.TempDir()
- opts.Storage.DatabaseDir = tmpDir
- opts.Storage.DatabasePath = tmpDir + "state.db"
- opts.Storage.KineSocketPath = kineSocket
- opts.Storage.KineConfig.Endpoint = fmt.Sprintf("sqlite://%s/db.sqlite", tmpDir)
- opts.Storage.KineConfig.Listener = kineSocket
-
- ctx, cancel := context.WithCancel(context.Background())
- defer cancel()
-
- completedOpts, err := opts.Complete(ctx)
- if err != nil {
- t.Fatalf("Failed to complete options: %v", err)
- }
-
- errCh := make(chan error, 1)
- go func() {
- errCh <- Run(ctx, completedOpts)
- }()
-
- testutils.WaitForStatus(t, healthAddr, "", 5*time.Second, testutils.IsServing)
-
- cancel()
-
- select {
- case err := <-errCh:
- if err != nil && err != context.Canceled {
- t.Errorf("exited with unexpected error: %v", err)
- }
- case <-time.After(5 * time.Second):
- t.Fatal("Failed to shut down within grace period")
- }
-
- if _, err := os.Stat(localSocket); err == nil {
- t.Errorf("socket file %q still exists after shutdown", localSocket)
- }
-}
-
-func TestRun_StorageFailure(t *testing.T) {
- opts := options.NewServerRunOptions()
-
- tmpDir := t.TempDir()
- readOnlyDir := filepath.Join(tmpDir, "readonly")
- if err := os.Mkdir(readOnlyDir, 0444); err != nil {
- t.Fatal(err)
- }
-
- opts.NodeName = "test-node"
- opts.Storage.DatabaseDir = readOnlyDir
- opts.Storage.DatabasePath = readOnlyDir + "state.db"
- opts.Storage.KineSocketPath = filepath.Join(readOnlyDir, "kine.sock")
- opts.Storage.KineConfig.Endpoint = fmt.Sprintf("sqlite://%s/db.sqlite", readOnlyDir)
-
- opts.HealthAddress = testutils.GetFreeTCPAddress(t)
- opts.GRPC.BindAddress = "unix://" + filepath.Join(tmpDir, "api.sock")
-
- ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
- defer cancel()
-
- completedOpts, _ := opts.Complete(ctx)
-
- errCh := make(chan error, 1)
- go func() {
- errCh <- Run(ctx, completedOpts)
- }()
-
- select {
- case err := <-errCh:
- if err == nil {
- t.Error("Expected server to fail due to storage error, but it exited with nil")
- }
- if !strings.Contains(err.Error(), "storage") && !strings.Contains(err.Error(), "permission denied") {
- t.Errorf("Expected storage or permission error, got: %v", err)
- }
- case <-time.After(5 * time.Second):
- t.Fatal("Server should have failed immediately on storage error, but it timed out/hung")
- }
-}
diff --git a/cmd/nvml-provider/main.go b/cmd/nvml-provider/main.go
new file mode 100644
index 000000000..57ec0f835
--- /dev/null
+++ b/cmd/nvml-provider/main.go
@@ -0,0 +1,726 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build nvml
+
+// Command nvml-provider is a standalone NVML-based GPU health provider that
+// connects to a device-api-server instance via gRPC.
+//
+// This is designed to run as a sidecar container alongside device-api-server,
+// providing GPU enumeration and health monitoring via NVML.
+//
+// Usage:
+//
+// nvml-provider --server-address=localhost:9001 --driver-root=/run/nvidia/driver
+package main
+
+import (
+ "context"
+ "flag"
+ "fmt"
+ "net"
+ "net/http"
+ "os"
+ "os/signal"
+ "strings"
+ "sync"
+ "syscall"
+ "time"
+
+ "github.com/NVIDIA/go-nvml/pkg/nvml"
+ "google.golang.org/grpc"
+ "google.golang.org/grpc/credentials/insecure"
+ "google.golang.org/grpc/health/grpc_health_v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/klog/v2"
+
+ devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1"
+ clientset "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned"
+ gpuclient "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned/typed/device/v1alpha1"
+ nvmlpkg "github.com/nvidia/nvsentinel/pkg/providers/nvml"
+)
+
+const (
+ // DefaultProviderID is the default identifier for this provider.
+ DefaultProviderID = "nvml-provider-sidecar"
+
+ // HeartbeatInterval is how often to send heartbeats.
+ HeartbeatInterval = 10 * time.Second
+
+ // HealthCheckPort is the HTTP port for health checks.
+ HealthCheckPort = 8082
+
+ // EventTimeout is the timeout for NVML event wait (in milliseconds).
+ EventTimeout = 5000
+
+ // DefaultServerAddress is the default device-api-server address.
+ DefaultServerAddress = "localhost:9001"
+
+ // ConnectionRetryInterval is how long to wait between connection attempts.
+ ConnectionRetryInterval = 5 * time.Second
+
+ // MaxConnectionRetries is the maximum number of connection attempts.
+ MaxConnectionRetries = 60
+)
+
+// Config holds the provider configuration.
+type Config struct {
+ ServerAddress string
+ ProviderID string
+ DriverRoot string
+ HealthCheckEnabled bool
+ HealthCheckPort int
+ IgnoredXids []uint64
+}
+
+// DefaultConfig returns a Config with sensible defaults.
+func DefaultConfig() Config {
+ return Config{
+ ServerAddress: DefaultServerAddress,
+ ProviderID: DefaultProviderID,
+ DriverRoot: "/run/nvidia/driver",
+ HealthCheckEnabled: true,
+ HealthCheckPort: HealthCheckPort,
+ }
+}
+
+// Provider is the standalone NVML provider that connects to device-api-server.
+type Provider struct {
+ config Config
+ logger klog.Logger
+
+ // gRPC clients
+ conn *grpc.ClientConn
+ gpuClient gpuclient.GPUInterface
+ healthClient grpc_health_v1.HealthClient
+
+ // NVML
+ nvmllib nvml.Interface
+ eventSet nvml.EventSet
+
+ // State
+ mu sync.RWMutex
+ gpuUUIDs []string
+ initialized bool
+ connected bool
+ healthy bool
+ monitorRunning bool
+
+ // Lifecycle
+ ctx context.Context
+ cancel context.CancelFunc
+ wg sync.WaitGroup
+}
+
+// NewProvider creates a new standalone NVML provider.
+func NewProvider(cfg Config, logger klog.Logger) *Provider {
+ return &Provider{
+ config: cfg,
+ logger: logger.WithName("nvml-provider"),
+ }
+}
+
+func main() {
+ // Initialize logging flags first
+ klog.InitFlags(nil)
+
+ cfg := parseFlags()
+ // flag.Parse() is called inside parseFlags()
+
+ logger := klog.Background()
+ logger.Info("Starting NVML provider sidecar",
+ "serverAddress", cfg.ServerAddress,
+ "providerID", cfg.ProviderID,
+ "driverRoot", cfg.DriverRoot,
+ "healthCheckEnabled", cfg.HealthCheckEnabled,
+ )
+
+ ctx, cancel := context.WithCancel(context.Background())
+ defer cancel()
+
+ // Handle signals
+ sigCh := make(chan os.Signal, 1)
+ signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
+ go func() {
+ sig := <-sigCh
+ logger.Info("Received signal, shutting down", "signal", sig)
+ cancel()
+ }()
+
+ // Create and run provider
+ provider := NewProvider(cfg, logger)
+ if err := provider.Run(ctx); err != nil {
+ logger.Error(err, "Provider failed")
+ os.Exit(1)
+ }
+
+ logger.Info("NVML provider shutdown complete")
+}
+
+func parseFlags() Config {
+ cfg := DefaultConfig()
+
+ flag.StringVar(&cfg.ServerAddress, "server-address", cfg.ServerAddress,
+ "Address of device-api-server gRPC endpoint")
+ flag.StringVar(&cfg.ProviderID, "provider-id", cfg.ProviderID,
+ "Unique identifier for this provider")
+ flag.StringVar(&cfg.DriverRoot, "driver-root", cfg.DriverRoot,
+ "Root path for NVIDIA driver libraries")
+ flag.BoolVar(&cfg.HealthCheckEnabled, "health-check", cfg.HealthCheckEnabled,
+ "Enable XID event monitoring for health checks")
+ flag.IntVar(&cfg.HealthCheckPort, "health-port", cfg.HealthCheckPort,
+ "HTTP port for health check endpoints")
+
+ // Parse flags
+ flag.Parse()
+
+ // Track which flags were explicitly set on the command line.
+ explicitFlags := make(map[string]bool)
+ flag.Visit(func(f *flag.Flag) {
+ explicitFlags[f.Name] = true
+ })
+
+ // Environment variables are used as fallback when the corresponding
+ // flag was not explicitly provided on the command line.
+ if !explicitFlags["server-address"] {
+ if addr := os.Getenv("PROVIDER_SERVER_ADDRESS"); addr != "" {
+ cfg.ServerAddress = addr
+ }
+ }
+ if !explicitFlags["provider-id"] {
+ if id := os.Getenv("PROVIDER_ID"); id != "" {
+ cfg.ProviderID = id
+ }
+ }
+ if !explicitFlags["driver-root"] {
+ // NVIDIA_DRIVER_ROOT follows the NVIDIA Container Toolkit convention.
+ // See: https://github.com/NVIDIA/nvidia-container-toolkit
+ if root := os.Getenv("NVIDIA_DRIVER_ROOT"); root != "" {
+ cfg.DriverRoot = root
+ }
+ }
+
+ return cfg
+}
+
+// Run starts the provider and blocks until the context is cancelled.
+func (p *Provider) Run(ctx context.Context) error {
+ p.ctx, p.cancel = context.WithCancel(ctx)
+ defer p.cancel()
+
+ // Start health check server
+ p.wg.Add(1)
+ go p.runHealthServer()
+
+ // Initialize NVML
+ if err := p.initNVML(); err != nil {
+ return fmt.Errorf("failed to initialize NVML: %w", err)
+ }
+ defer p.shutdownNVML()
+
+ // Connect to server with retry
+ if err := p.connectWithRetry(); err != nil {
+ return fmt.Errorf("failed to connect to server: %w", err)
+ }
+ defer p.disconnect()
+
+ // Enumerate and register GPUs (or reconcile if reconnecting)
+ if err := p.enumerateAndRegisterGPUs(); err != nil {
+ return fmt.Errorf("failed to enumerate GPUs: %w", err)
+ }
+
+ // Reconcile state (handles restart/reconnection scenarios)
+ if err := p.ReconcileState(p.ctx); err != nil {
+ // Reconciliation failure is not fatal - log and continue
+ p.logger.Error(err, "State reconciliation failed, continuing")
+ }
+
+ // Start heartbeat loop
+ p.wg.Add(1)
+ go p.runHeartbeatLoop()
+
+ // Start health monitoring if enabled
+ if p.config.HealthCheckEnabled && len(p.gpuUUIDs) > 0 {
+ p.wg.Add(1)
+ go p.runHealthMonitor()
+ }
+
+ // Mark as healthy
+ p.setHealthy(true)
+
+ // Wait for shutdown
+ <-p.ctx.Done()
+
+ // Graceful shutdown
+ p.setHealthy(false)
+ p.wg.Wait()
+
+ return nil
+}
+
+// initNVML initializes the NVML library.
+func (p *Provider) initNVML() error {
+ // Find NVML library
+ libraryPath := nvmlpkg.FindDriverLibrary(p.config.DriverRoot)
+ if libraryPath != "" {
+ p.logger.V(2).Info("Using NVML library", "path", libraryPath)
+ p.nvmllib = nvml.New(nvml.WithLibraryPath(libraryPath))
+ } else {
+ p.logger.V(2).Info("Using system default NVML library")
+ p.nvmllib = nvml.New()
+ }
+
+ // Initialize
+ ret := p.nvmllib.Init()
+ if ret != nvml.SUCCESS {
+ return fmt.Errorf("NVML init failed: %v", nvml.ErrorString(ret))
+ }
+
+ // Log driver version
+ if version, ret := p.nvmllib.SystemGetDriverVersion(); ret == nvml.SUCCESS {
+ p.logger.Info("NVML initialized", "driverVersion", version)
+ }
+
+ p.initialized = true
+ return nil
+}
+
+// shutdownNVML shuts down the NVML library.
+func (p *Provider) shutdownNVML() {
+ if !p.initialized {
+ return
+ }
+
+ if p.eventSet != nil {
+ p.eventSet.Free()
+ p.eventSet = nil
+ }
+
+ p.nvmllib.Shutdown()
+ p.initialized = false
+ p.logger.V(1).Info("NVML shutdown complete")
+}
+
+
+// isLocalhostAddress returns true if the address refers to the local machine.
+func isLocalhostAddress(addr string) bool {
+ // Unix socket paths are inherently local.
+ if strings.HasPrefix(addr, "unix://") || strings.HasPrefix(addr, "/") {
+ return true
+ }
+ host := addr
+ if h, _, err := net.SplitHostPort(addr); err == nil {
+ host = h
+ }
+ return host == "localhost" || host == "127.0.0.1" || host == "::1" || host == ""
+}
+
+// connectWithRetry connects to the device-api-server with retry logic.
+func (p *Provider) connectWithRetry() error {
+ // Validate that ServerAddress is localhost when using insecure credentials.
+ // This prevents accidental exposure of unencrypted gRPC traffic over the network.
+ if !isLocalhostAddress(p.config.ServerAddress) {
+		return fmt.Errorf("insecure credentials require localhost address, got %q; "+
+			"set --server-address to a localhost address or use TLS", p.config.ServerAddress)
+ }
+
+ var lastErr error
+
+ for i := 0; i < MaxConnectionRetries; i++ {
+ select {
+ case <-p.ctx.Done():
+ return p.ctx.Err()
+ default:
+ }
+
+ // Insecure credentials are acceptable here: the provider connects to
+ // device-api-server via localhost within the same pod (sidecar pattern).
+ conn, err := grpc.NewClient(
+ p.config.ServerAddress,
+ grpc.WithTransportCredentials(insecure.NewCredentials()),
+ )
+ if err != nil {
+ lastErr = err
+ p.logger.V(1).Info("Connection attempt failed, retrying",
+ "attempt", i+1,
+ "error", err,
+ )
+ time.Sleep(ConnectionRetryInterval)
+ continue
+ }
+
+ p.conn = conn
+ cs := clientset.New(conn)
+ p.gpuClient = cs.DeviceV1alpha1().GPUs()
+ p.healthClient = grpc_health_v1.NewHealthClient(conn)
+
+ // Wait for server to be ready
+ if err := p.waitForServerReady(); err != nil {
+ conn.Close()
+ lastErr = err
+ p.logger.V(1).Info("Server not ready, retrying",
+ "attempt", i+1,
+ "error", err,
+ )
+ time.Sleep(ConnectionRetryInterval)
+ continue
+ }
+
+ p.connected = true
+ p.logger.Info("Connected to device-api-server", "address", p.config.ServerAddress)
+ return nil
+ }
+
+ return fmt.Errorf("failed to connect after %d attempts: %w", MaxConnectionRetries, lastErr)
+}
+
+// waitForServerReady waits for the server to report healthy.
+func (p *Provider) waitForServerReady() error {
+ ctx, cancel := context.WithTimeout(p.ctx, 5*time.Second)
+ defer cancel()
+
+ resp, err := p.healthClient.Check(ctx, &grpc_health_v1.HealthCheckRequest{})
+ if err != nil {
+ return fmt.Errorf("health check failed: %w", err)
+ }
+
+ if resp.Status != grpc_health_v1.HealthCheckResponse_SERVING {
+ return fmt.Errorf("server not serving: %v", resp.Status)
+ }
+
+ return nil
+}
+
+// disconnect closes the gRPC connection.
+func (p *Provider) disconnect() {
+ if p.conn != nil {
+ p.conn.Close()
+ p.conn = nil
+ }
+ p.connected = false
+}
+
+// enumerateAndRegisterGPUs discovers GPUs via NVML and registers them.
+func (p *Provider) enumerateAndRegisterGPUs() error {
+ count, ret := p.nvmllib.DeviceGetCount()
+ if ret != nvml.SUCCESS {
+ return fmt.Errorf("failed to get device count: %v", nvml.ErrorString(ret))
+ }
+
+ if count == 0 {
+ p.logger.Info("No GPUs found on this node")
+ return nil
+ }
+
+ p.logger.Info("Enumerating GPUs", "count", count)
+ uuids := make([]string, 0, count)
+
+ for i := 0; i < count; i++ {
+ device, ret := p.nvmllib.DeviceGetHandleByIndex(i)
+ if ret != nvml.SUCCESS {
+ p.logger.Error(nil, "Failed to get device handle", "index", i, "error", nvml.ErrorString(ret))
+ continue
+ }
+
+ uuid, ret := device.GetUUID()
+ if ret != nvml.SUCCESS {
+ p.logger.Error(nil, "Failed to get device UUID", "index", i, "error", nvml.ErrorString(ret))
+ continue
+ }
+
+ // Get device info for registration
+ productName, _ := device.GetName()
+ var memoryBytes uint64
+ if memInfo, ret := device.GetMemoryInfo(); ret == nvml.SUCCESS {
+ memoryBytes = memInfo.Total
+ }
+
+ // Register GPU with server
+ if err := p.registerGPU(uuid, productName, memoryBytes); err != nil {
+ p.logger.Error(err, "Failed to register GPU", "uuid", uuid)
+ continue
+ }
+
+ uuids = append(uuids, uuid)
+ p.logger.Info("Registered GPU",
+ "uuid", uuid,
+ "productName", productName,
+ "memory", nvmlpkg.FormatBytes(memoryBytes),
+ )
+ }
+
+ p.mu.Lock()
+ p.gpuUUIDs = uuids
+ p.mu.Unlock()
+
+ p.logger.Info("GPU enumeration complete", "registered", len(uuids))
+ return nil
+}
+
+// registerGPU registers a single GPU with the device-api-server using Create.
+//
+// The GPU object is named by its UUID and carries an initial NVMLReady=True
+// condition describing the enumerated product name and memory size.
+//
+// NOTE(review): Create will presumably fail with AlreadyExists if the object
+// survives a provider restart on a warm server cache — confirm callers
+// tolerate that (enumerateAndRegisterGPUs logs and skips the device).
+func (p *Provider) registerGPU(uuid, productName string, memoryBytes uint64) error {
+ // 5s per-registration budget, bounded by the provider's lifetime context.
+ ctx, cancel := context.WithTimeout(p.ctx, 5*time.Second)
+ defer cancel()
+
+ gpu := &devicev1alpha1.GPU{
+ ObjectMeta: metav1.ObjectMeta{Name: uuid},
+ Spec: devicev1alpha1.GPUSpec{UUID: uuid},
+ Status: devicev1alpha1.GPUStatus{
+ Conditions: []metav1.Condition{
+ {
+ Type: nvmlpkg.ConditionTypeNVMLReady,
+ Status: metav1.ConditionStatus(nvmlpkg.ConditionStatusTrue),
+ Reason: "Initialized",
+ Message: fmt.Sprintf("GPU enumerated via NVML: %s (%s)", productName, nvmlpkg.FormatBytes(memoryBytes)),
+ LastTransitionTime: metav1.Now(),
+ },
+ },
+ },
+ }
+
+ _, err := p.gpuClient.Create(ctx, gpu, metav1.CreateOptions{})
+ return err
+}
+
+// runHeartbeatLoop sends periodic heartbeats to the server.
+//
+// Runs until p.ctx is cancelled; heartbeat failures are logged but never
+// terminate the loop. Must be launched with p.wg.Add(1) by the caller.
+func (p *Provider) runHeartbeatLoop() {
+ defer p.wg.Done()
+
+ ticker := time.NewTicker(HeartbeatInterval)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-p.ctx.Done():
+ return
+ case <-ticker.C:
+ if err := p.sendHeartbeat(); err != nil {
+ p.logger.Error(err, "Failed to send heartbeat")
+ }
+ }
+ }
+}
+
+// sendHeartbeat performs a health check on the server connection.
+// Note: The Heartbeat RPC was removed. We now just verify the server is reachable.
+//
+// Returns an error if the health RPC fails or the server reports any status
+// other than SERVING. An empty Service name in the request queries the
+// server's overall serving status.
+func (p *Provider) sendHeartbeat() error {
+ ctx, cancel := context.WithTimeout(p.ctx, 5*time.Second)
+ defer cancel()
+
+ // Verify server connectivity by checking gRPC health
+ resp, err := p.healthClient.Check(ctx, &grpc_health_v1.HealthCheckRequest{})
+ if err != nil {
+ return err
+ }
+
+ if resp.Status != grpc_health_v1.HealthCheckResponse_SERVING {
+ return fmt.Errorf("server not serving: %v", resp.Status)
+ }
+
+ p.mu.RLock()
+ gpuCount := len(p.gpuUUIDs)
+ p.mu.RUnlock()
+
+ p.logger.V(4).Info("Health check passed", "gpuCount", gpuCount)
+ return nil
+}
+
+// runHealthMonitor monitors NVML events for GPU health changes.
+//
+// It creates an NVML event set, registers every enumerable device for XID
+// and ECC error events, then polls the event set until p.ctx is cancelled.
+// Registration failures for individual devices are logged at V(1) and
+// tolerated. Must be launched with p.wg.Add(1) by the caller.
+func (p *Provider) runHealthMonitor() {
+ p.mu.Lock()
+ p.monitorRunning = true
+ p.mu.Unlock()
+
+ defer func() {
+ p.mu.Lock()
+ p.monitorRunning = false
+ p.mu.Unlock()
+ }()
+
+ // Create event set
+ eventSet, ret := p.nvmllib.EventSetCreate()
+ if ret != nvml.SUCCESS {
+ p.logger.Error(nil, "Failed to create event set", "error", nvml.ErrorString(ret))
+ return
+ }
+ defer eventSet.Free()
+ // NOTE(review): p.eventSet still holds the handle after Free() runs on
+ // return — confirm nothing reads p.eventSet after this goroutine exits.
+ p.eventSet = eventSet
+
+ // Register devices for XID events
+ deviceCount, ret := p.nvmllib.DeviceGetCount()
+ if ret != nvml.SUCCESS {
+ p.logger.Error(nil, "Failed to get device count", "error", nvml.ErrorString(ret))
+ return
+ }
+
+ for i := 0; i < deviceCount; i++ {
+ device, ret := p.nvmllib.DeviceGetHandleByIndex(i)
+ if ret != nvml.SUCCESS {
+ continue
+ }
+ ret = device.RegisterEvents(nvml.EventTypeXidCriticalError|nvml.EventTypeSingleBitEccError|nvml.EventTypeDoubleBitEccError, eventSet)
+ if ret != nvml.SUCCESS {
+ p.logger.V(1).Info("Failed to register events for device", "index", i, "error", nvml.ErrorString(ret))
+ }
+ }
+
+ p.logger.Info("Health monitor started")
+
+ // Event loop: Wait blocks up to EventTimeout, so cancellation is observed
+ // with at most that much latency via the non-blocking select above it.
+ for {
+ select {
+ case <-p.ctx.Done():
+ return
+ default:
+ }
+
+ data, ret := eventSet.Wait(EventTimeout)
+ if ret == nvml.ERROR_TIMEOUT {
+ continue
+ }
+ if ret != nvml.SUCCESS {
+ p.logger.V(1).Info("Event wait error", "error", nvml.ErrorString(ret))
+ continue
+ }
+
+ p.handleXIDEvent(data)
+ }
+}
+
+// handleXIDEvent processes an XID error event.
+//
+// Events are filtered in two stages: XIDs in the default ignore list are
+// dropped silently (V(2)), and non-critical XIDs are logged but do not
+// change GPU status. Only critical XIDs push an NVMLReady=False condition
+// to the server via UpdateStatus.
+func (p *Provider) handleXIDEvent(data nvml.EventData) {
+ if data.Device == nil {
+ p.logger.Error(nil, "Received XID event with nil device handle")
+ return
+ }
+
+ uuid, ret := data.Device.GetUUID()
+ if ret != nvml.SUCCESS {
+ p.logger.Error(nil, "Failed to get device UUID from event")
+ return
+ }
+
+ // EventData carries the XID number for XID-class events.
+ xid := data.EventData
+ p.logger.Info("XID event received",
+ "uuid", uuid,
+ "xid", xid,
+ "eventType", data.EventType,
+ )
+
+ // Skip ignored XIDs (application-level errors, not hardware failures).
+ // This matches the in-process provider behavior in pkg/providers/nvml/health_monitor.go.
+ if nvmlpkg.IsDefaultIgnored(xid) {
+ p.logger.V(2).Info("Ignoring non-critical XID",
+ "uuid", uuid,
+ "xid", xid,
+ )
+ return
+ }
+
+ // Only critical XIDs trigger a health state change.
+ // Non-critical, non-ignored XIDs are logged but do not update GPU status,
+ // matching the in-process provider behavior in pkg/providers/nvml/health_monitor.go.
+ if !nvmlpkg.IsCriticalXid(xid) {
+ p.logger.V(2).Info("Non-critical XID, skipping status update",
+ "uuid", uuid,
+ "xid", xid,
+ )
+ return
+ }
+
+ p.logger.Info("Critical XID error detected",
+ "uuid", uuid,
+ "xid", xid,
+ )
+
+ ctx, cancel := context.WithTimeout(p.ctx, 5*time.Second)
+ defer cancel()
+
+ // Mark the GPU not-ready; the update failure is logged but not retried here.
+ gpu := &devicev1alpha1.GPU{
+ ObjectMeta: metav1.ObjectMeta{Name: uuid},
+ Status: devicev1alpha1.GPUStatus{
+ Conditions: []metav1.Condition{
+ {
+ Type: nvmlpkg.ConditionTypeNVMLReady,
+ Status: metav1.ConditionStatus(nvmlpkg.ConditionStatusFalse),
+ Reason: "XIDError",
+ Message: fmt.Sprintf("Critical XID error: %d", xid),
+ LastTransitionTime: metav1.Now(),
+ },
+ },
+ },
+ }
+
+ if _, err := p.gpuClient.UpdateStatus(ctx, gpu, metav1.UpdateOptions{}); err != nil {
+ p.logger.Error(err, "Failed to update GPU status", "uuid", uuid)
+ }
+}
+
+// runHealthServer runs the HTTP health check server.
+//
+// Serves /healthz and /livez (always 200) plus /readyz (reflects p.healthy)
+// on p.config.HealthCheckPort. A watcher goroutine shuts the server down
+// with a 5s grace period when p.ctx is cancelled. Must be launched with
+// p.wg.Add(1) by the caller.
+func (p *Provider) runHealthServer() {
+ defer p.wg.Done()
+
+ mux := http.NewServeMux()
+ mux.HandleFunc("/healthz", p.handleHealthz)
+ mux.HandleFunc("/readyz", p.handleReadyz)
+ mux.HandleFunc("/livez", p.handleHealthz)
+
+ server := &http.Server{
+ Addr: fmt.Sprintf(":%d", p.config.HealthCheckPort),
+ Handler: mux,
+ ReadHeaderTimeout: 5 * time.Second,
+ ReadTimeout: 10 * time.Second,
+ WriteTimeout: 10 * time.Second,
+ }
+
+ go func() {
+ <-p.ctx.Done()
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+ // Shutdown's error is intentionally dropped: we are exiting anyway.
+ server.Shutdown(ctx)
+ }()
+
+ p.logger.Info("Health server started", "port", p.config.HealthCheckPort)
+ // NOTE(review): direct sentinel comparison relies on ListenAndServe
+ // returning http.ErrServerClosed unwrapped (it does today);
+ // errors.Is would be more defensive.
+ if err := server.ListenAndServe(); err != http.ErrServerClosed {
+ p.logger.Error(err, "Health server error")
+ }
+}
+
+// handleHealthz answers liveness probes; always 200 while the process runs.
+func (p *Provider) handleHealthz(w http.ResponseWriter, _ *http.Request) {
+ w.WriteHeader(http.StatusOK)
+ w.Write([]byte("ok\n"))
+}
+
+// handleReadyz answers readiness probes from the p.healthy flag:
+// 200 when ready, 503 otherwise.
+func (p *Provider) handleReadyz(w http.ResponseWriter, _ *http.Request) {
+ p.mu.RLock()
+ healthy := p.healthy
+ p.mu.RUnlock()
+
+ if healthy {
+ w.WriteHeader(http.StatusOK)
+ w.Write([]byte("ok\n"))
+ } else {
+ w.WriteHeader(http.StatusServiceUnavailable)
+ w.Write([]byte("not ready\n"))
+ }
+}
+
+// setHealthy records the readiness flag consumed by handleReadyz.
+func (p *Provider) setHealthy(healthy bool) {
+ p.mu.Lock()
+ p.healthy = healthy
+ p.mu.Unlock()
+}
+
diff --git a/cmd/nvml-provider/reconciler.go b/cmd/nvml-provider/reconciler.go
new file mode 100644
index 000000000..af5f68b6c
--- /dev/null
+++ b/cmd/nvml-provider/reconciler.go
@@ -0,0 +1,308 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build nvml
+
+package main
+
+import (
+ "context"
+ "fmt"
+ "time"
+
+ "github.com/NVIDIA/go-nvml/pkg/nvml"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+ devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1"
+ nvmlpkg "github.com/nvidia/nvsentinel/pkg/providers/nvml"
+)
+
+// ReconcileState reconciles the provider's state with the device-api-server.
+//
+// This is called on startup and after reconnection to ensure:
+// 1. GPUs that were removed while disconnected are unregistered
+// 2. GPUs that were added while disconnected are registered
+// 3. GPU health states are reconciled with current NVML state
+//
+// This handles scenarios like:
+// - Provider crash and restart
+// - Network partition recovery
+// - GPU hotplug/removal during provider downtime
+//
+// Per-GPU failures in steps 3-5 are logged and skipped; the function still
+// returns nil in that case. Only failures to read the two source-of-truth
+// lists (server cache, NVML) abort reconciliation with an error.
+func (p *Provider) ReconcileState(ctx context.Context) error {
+ p.logger.Info("Starting state reconciliation")
+
+ // Step 1: Get current state from server
+ cachedGPUs, err := p.listCachedGPUs(ctx)
+ if err != nil {
+ return fmt.Errorf("failed to list cached GPUs: %w", err)
+ }
+
+ p.logger.V(1).Info("Retrieved cached GPU state", "count", len(cachedGPUs))
+
+ // Step 2: Get current GPU UUIDs from NVML
+ currentUUIDs, err := p.getCurrentGPUUUIDs()
+ if err != nil {
+ return fmt.Errorf("failed to get current GPU UUIDs: %w", err)
+ }
+
+ p.logger.V(1).Info("Current GPUs from NVML", "count", len(currentUUIDs))
+
+ // Build lookup maps
+ cachedUUIDSet := make(map[string]*devicev1alpha1.GPU)
+ for i := range cachedGPUs {
+ gpu := &cachedGPUs[i]
+ cachedUUIDSet[gpu.Spec.UUID] = gpu
+ }
+
+ currentUUIDSet := make(map[string]bool)
+ for _, uuid := range currentUUIDs {
+ currentUUIDSet[uuid] = true
+ }
+
+ // Step 3: Find and unregister removed GPUs
+ for uuid := range cachedUUIDSet {
+ if !currentUUIDSet[uuid] {
+ p.logger.Info("GPU was removed, unregistering", "uuid", uuid)
+ if err := p.unregisterGPU(ctx, uuid); err != nil {
+ p.logger.Error(err, "Failed to unregister removed GPU", "uuid", uuid)
+ // Continue with other GPUs
+ }
+ }
+ }
+
+ // Step 4: Find and register new GPUs
+ for _, uuid := range currentUUIDs {
+ if _, exists := cachedUUIDSet[uuid]; !exists {
+ p.logger.Info("New GPU found, registering", "uuid", uuid)
+ if err := p.registerNewGPU(ctx, uuid); err != nil {
+ p.logger.Error(err, "Failed to register new GPU", "uuid", uuid)
+ // Continue with other GPUs
+ }
+ }
+ }
+
+ // Step 5: Reconcile health state for existing GPUs
+ for _, uuid := range currentUUIDs {
+ if cachedGPU, exists := cachedUUIDSet[uuid]; exists {
+ if err := p.reconcileGPUHealth(ctx, uuid, cachedGPU); err != nil {
+ p.logger.Error(err, "Failed to reconcile GPU health", "uuid", uuid)
+ // Continue with other GPUs
+ }
+ }
+ }
+
+ // Step 6: Update local GPU list
+ p.mu.Lock()
+ p.gpuUUIDs = currentUUIDs
+ p.mu.Unlock()
+
+ p.logger.Info("State reconciliation complete",
+ "totalGPUs", len(currentUUIDs),
+ )
+
+ return nil
+}
+
+// listCachedGPUs retrieves the list of GPUs from the server cache.
+//
+// Note: This lists ALL GPUs, not just those from this provider.
+// TODO: Add provider_id filtering to ListGpus RPC for efficiency.
+func (p *Provider) listCachedGPUs(ctx context.Context) ([]devicev1alpha1.GPU, error) {
+ // Note: If the parent context has a shorter deadline, WithTimeout
+ // inherits the parent's deadline. This is the correct behavior:
+ // reconciliation should respect the overall operation timeout.
+ ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
+ defer cancel()
+
+ gpuList, err := p.gpuClient.List(ctx, metav1.ListOptions{})
+ if err != nil {
+ return nil, err
+ }
+
+ // Filter to only GPUs that might belong to this provider
+ // For now, we assume all GPUs belong to us since we're the only provider
+ // A more robust solution would use provider_id filtering
+ return gpuList.Items, nil
+}
+
+// getCurrentGPUUUIDs gets the list of GPU UUIDs currently visible to NVML.
+//
+// Devices whose handle or UUID cannot be read are silently skipped, so the
+// result may be shorter than DeviceGetCount reports.
+func (p *Provider) getCurrentGPUUUIDs() ([]string, error) {
+ count, ret := p.nvmllib.DeviceGetCount()
+ if ret != nvml.SUCCESS {
+ return nil, fmt.Errorf("failed to get device count: %v", nvml.ErrorString(ret))
+ }
+
+ uuids := make([]string, 0, count)
+ for i := 0; i < count; i++ {
+ device, ret := p.nvmllib.DeviceGetHandleByIndex(i)
+ if ret != nvml.SUCCESS {
+ continue
+ }
+
+ uuid, ret := device.GetUUID()
+ if ret != nvml.SUCCESS {
+ continue
+ }
+
+ uuids = append(uuids, uuid)
+ }
+
+ return uuids, nil
+}
+
+// unregisterGPU removes a GPU from the server using Delete.
+func (p *Provider) unregisterGPU(ctx context.Context, uuid string) error {
+ // Note: If the parent context has a shorter deadline, WithTimeout
+ // inherits the parent's deadline. This is the correct behavior:
+ // reconciliation should respect the overall operation timeout.
+ ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
+ defer cancel()
+
+ return p.gpuClient.Delete(ctx, uuid, metav1.DeleteOptions{})
+}
+
+// registerNewGPU registers a newly discovered GPU.
+//
+// It scans all NVML devices for the matching UUID to collect the product
+// name and memory size (O(n) per call). If the device vanished between
+// discovery and this call, registration proceeds with productName
+// "Unknown" and zero memory rather than failing.
+func (p *Provider) registerNewGPU(ctx context.Context, uuid string) error {
+ // Get device info from NVML
+ productName := "Unknown"
+ var memoryBytes uint64
+
+ // Find the device by UUID
+ count, ret := p.nvmllib.DeviceGetCount()
+ if ret == nvml.SUCCESS {
+ for i := 0; i < count; i++ {
+ device, ret := p.nvmllib.DeviceGetHandleByIndex(i)
+ if ret != nvml.SUCCESS {
+ continue
+ }
+ deviceUUID, ret := device.GetUUID()
+ if ret != nvml.SUCCESS || deviceUUID != uuid {
+ continue
+ }
+
+ // Found the device
+ if name, ret := device.GetName(); ret == nvml.SUCCESS {
+ productName = name
+ }
+ if memInfo, ret := device.GetMemoryInfo(); ret == nvml.SUCCESS {
+ memoryBytes = memInfo.Total
+ }
+ break
+ }
+ }
+
+ return p.registerGPU(uuid, productName, memoryBytes)
+}
+
+// reconcileGPUHealth compares cached health state with current NVML state.
+//
+// If the GPU was marked as Unknown (due to provider timeout) but is now
+// healthy per NVML, we update it back to healthy.
+//
+// It inspects the first "Ready" or NVMLReady condition on the cached object;
+// any other cached status (True/False), or an unhealthy NVML result, leaves
+// the server state untouched and returns nil.
+func (p *Provider) reconcileGPUHealth(ctx context.Context, uuid string, cachedGPU *devicev1alpha1.GPU) error {
+ // Check if the cached state shows Unknown (from heartbeat timeout)
+ var cachedCondition *metav1.Condition
+ for i := range cachedGPU.Status.Conditions {
+ cond := &cachedGPU.Status.Conditions[i]
+ if cond.Type == "Ready" || cond.Type == nvmlpkg.ConditionTypeNVMLReady {
+ cachedCondition = cond
+ break
+ }
+ }
+
+ // If the condition is Unknown, query NVML and update if healthy
+ if cachedCondition != nil && string(cachedCondition.Status) == nvmlpkg.ConditionStatusUnknown {
+ p.logger.Info("GPU has Unknown status, checking current NVML state", "uuid", uuid)
+
+ // For now, if we can enumerate the GPU via NVML, consider it healthy
+ // A more sophisticated check would query specific health indicators
+ healthy, err := p.isGPUHealthy(uuid)
+ if err != nil {
+ return fmt.Errorf("failed to check GPU health: %w", err)
+ }
+
+ if healthy {
+ p.logger.Info("GPU is healthy per NVML, updating status", "uuid", uuid)
+ return p.updateGPUCondition(ctx, uuid, nvmlpkg.ConditionStatusTrue, "Recovered", "GPU recovered after provider reconnection")
+ }
+ }
+
+ return nil
+}
+
+// isGPUHealthy checks if a GPU is healthy via NVML.
+//
+// Healthy means: the device with this UUID is enumerable, responds to
+// GetMemoryInfo, and has no pending page retirements. A UUID that is not
+// found reports (false, nil) rather than an error; only a failed
+// DeviceGetCount returns a non-nil error.
+func (p *Provider) isGPUHealthy(uuid string) (bool, error) {
+ // Find device by UUID
+ count, ret := p.nvmllib.DeviceGetCount()
+ if ret != nvml.SUCCESS {
+ return false, fmt.Errorf("failed to get device count: %v", nvml.ErrorString(ret))
+ }
+
+ for i := 0; i < count; i++ {
+ device, ret := p.nvmllib.DeviceGetHandleByIndex(i)
+ if ret != nvml.SUCCESS {
+ continue
+ }
+ deviceUUID, ret := device.GetUUID()
+ if ret != nvml.SUCCESS || deviceUUID != uuid {
+ continue
+ }
+
+ // Device found - check basic health indicators
+ // 1. Can we get memory info? (basic liveness check)
+ if _, ret := device.GetMemoryInfo(); ret != nvml.SUCCESS {
+ return false, nil
+ }
+
+ // 2. Check for pending page retirements (ECC errors)
+ // A non-SUCCESS return here is treated as "no pending retirements";
+ // some devices may not support the query.
+ if pending, ret := device.GetRetiredPagesPendingStatus(); ret == nvml.SUCCESS {
+ if pending == nvml.FEATURE_ENABLED {
+ p.logger.V(1).Info("GPU has pending page retirements", "uuid", uuid)
+ return false, nil
+ }
+ }
+
+ // Device is accessible and no pending issues
+ return true, nil
+ }
+
+ // Device not found - not healthy
+ return false, nil
+}
+
+// updateGPUCondition updates a GPU's status via UpdateStatus.
+//
+// It writes a single NVMLReady condition with the given status, reason and
+// message onto the GPU named by uuid, stamped with the current time.
+func (p *Provider) updateGPUCondition(ctx context.Context, uuid, status, reason, message string) error {
+ // Note: If the parent context has a shorter deadline, WithTimeout
+ // inherits the parent's deadline. This is the correct behavior:
+ // reconciliation should respect the overall operation timeout.
+ ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
+ defer cancel()
+
+ gpu := &devicev1alpha1.GPU{
+ ObjectMeta: metav1.ObjectMeta{Name: uuid},
+ Status: devicev1alpha1.GPUStatus{
+ Conditions: []metav1.Condition{
+ {
+ Type: nvmlpkg.ConditionTypeNVMLReady,
+ Status: metav1.ConditionStatus(status),
+ Reason: reason,
+ Message: message,
+ LastTransitionTime: metav1.Now(),
+ },
+ },
+ },
+ }
+
+ _, err := p.gpuClient.UpdateStatus(ctx, gpu, metav1.UpdateOptions{})
+ return err
+}
diff --git a/code-generator/cmd/client-gen/generators/generator_for_type.go b/code-generator/cmd/client-gen/generators/generator_for_type.go
index dc4a11bef..028a65658 100644
--- a/code-generator/cmd/client-gen/generators/generator_for_type.go
+++ b/code-generator/cmd/client-gen/generators/generator_for_type.go
@@ -15,7 +15,7 @@ limitations under the License.
*/
/*
-Portions Copyright (c) 2025 NVIDIA CORPORATION. All rights reserved.
+Portions Copyright (c) 2026 NVIDIA CORPORATION. All rights reserved.
Modified from the original to support gRPC transport.
Origin: https://github.com/kubernetes/code-generator/blob/v0.34.1/cmd/client-gen/generators/generator_for_type.go
@@ -401,9 +401,24 @@ func (c *$.type|allLowercasePlural$) Update(ctx $.context|raw$, $.type|allLowerc
`
var updateStatusTemplate = `
-// TODO: Implement UpdateStatus support.
+// UpdateStatus updates only the status subresource of a $.type|public$.
func (c *$.type|allLowercasePlural$) UpdateStatus(ctx $.context|raw$, $.type|allLowercase$ *$.type|raw$, opts $.UpdateOptions|raw$) (*$.type|raw$, error) {
- return nil, $.fmtErrorf|raw$("UpdateStatus not implemented")
+ resp, err := c.client.Update$.ProtoType$Status(ctx, &$.pb$.Update$.ProtoType$StatusRequest{
+ $.ProtoType$: $.ToProto|raw$($.type|allLowercase$),
+ Opts: &$.pb$.UpdateOptions{},
+ })
+ if err != nil {
+ return nil, err
+ }
+
+ obj := $.FromProto|raw$(resp)
+ c.logger.V(2).Info("Updated $.type|public$ status",
+ "name", obj.GetName(),
+ "namespace", c.getNamespace(),
+ "resource-version", obj.GetResourceVersion(),
+ )
+
+ return obj, nil
}
`
diff --git a/demos/nvml-sidecar-demo.sh b/demos/nvml-sidecar-demo.sh
new file mode 100755
index 000000000..cb5ffe06d
--- /dev/null
+++ b/demos/nvml-sidecar-demo.sh
@@ -0,0 +1,752 @@
+#!/bin/bash
+# NVML Provider Sidecar Demo
+# Demonstrates the NVML provider sidecar architecture for GPU enumeration
+#
+# Prerequisites:
+# - kubectl configured with GPU cluster access
+# - docker with buildx for building images
+# - helm 3.x installed
+# - GPU nodes with RuntimeClass 'nvidia'
+#
+# Usage: ./demos/nvml-sidecar-demo.sh [kubeconfig]
+#
+# Environment Variables (all optional):
+# KUBECONFIG - Path to kubeconfig file (default: $HOME/.kube/config)
+# NAMESPACE - Kubernetes namespace (default: device-api)
+# RELEASE_NAME - Helm release name (default: device-api-server)
+# IMAGE_REGISTRY - Container registry (default: ttl.sh)
+# IMAGE_TAG - Image tag (default: 2h for ttl.sh expiry)
+# SERVER_IMAGE - Full device-api-server image (default: $IMAGE_REGISTRY/device-api-server:$IMAGE_TAG)
+# SIDECAR_IMAGE - Full sidecar image (default: $IMAGE_REGISTRY/device-api-server-sidecar:$IMAGE_TAG)
+# BUILD_PLATFORM - Target platform for builds (default: linux/amd64)
+# GPU_NODE_SELECTOR - Label selector for GPU nodes (default: nvidia.com/gpu.present=true)
+# CHART_PATH - Path to Helm chart (default: deployments/helm/device-api-server)
+# VALUES_FILE - Path to values file (default: deployments/helm/values-sidecar-test.yaml)
+# DOCKERFILE - Path to Dockerfile (default: deployments/container/Dockerfile)
+# APP_NAME - Helm chart app name for pod selectors (default: device-api-server)
+# CONTAINER_NAME - Main container name (default: device-api-server)
+# SIDECAR_CONTAINER_NAME - Sidecar container name (default: nvml-provider)
+# INTERACTIVE - Enable interactive mode with prompts (default: true)
+# SKIP_DESTRUCTIVE - Skip destructive ops in non-interactive mode (default: true)
+# SKIP_BUILD - Skip image building entirely (default: false)
+#
+# Examples:
+# # Use default settings with ttl.sh
+# ./demos/nvml-sidecar-demo.sh
+#
+# # Use custom kubeconfig
+# KUBECONFIG=~/.kube/config-aws-gpu ./demos/nvml-sidecar-demo.sh
+#
+# # Use custom registry
+# IMAGE_REGISTRY=ghcr.io/nvidia IMAGE_TAG=latest ./demos/nvml-sidecar-demo.sh
+#
+# # Non-interactive mode (for CI/automation)
+# INTERACTIVE=false KUBECONFIG=~/.kube/config ./demos/nvml-sidecar-demo.sh
+
+set -euo pipefail
+
+# ==============================================================================
+# Configuration (all values configurable via environment variables)
+# ==============================================================================
+
+# Kubernetes configuration
+# NOTE(review): because KUBECONFIG is checked first, an exported KUBECONFIG
+# takes precedence over the positional [kubeconfig] argument documented in
+# the usage line — confirm that precedence is intended.
+KUBECONFIG="${KUBECONFIG:-${1:-$HOME/.kube/config}}"
+NAMESPACE="${NAMESPACE:-device-api}"
+RELEASE_NAME="${RELEASE_NAME:-device-api-server}"
+
+# Paths (relative to repo root)
+CHART_PATH="${CHART_PATH:-deployments/helm/device-api-server}"
+VALUES_FILE="${VALUES_FILE:-deployments/helm/values-sidecar-test.yaml}"
+DOCKERFILE="${DOCKERFILE:-deployments/container/Dockerfile}"
+
+# Image registry settings
+IMAGE_REGISTRY="${IMAGE_REGISTRY:-ttl.sh}"
+IMAGE_TAG="${IMAGE_TAG:-2h}"
+
+# Image names (using ttl.sh ephemeral registry by default - images expire based on tag)
+SERVER_IMAGE="${SERVER_IMAGE:-${IMAGE_REGISTRY}/device-api-server:${IMAGE_TAG}}"
+SIDECAR_IMAGE="${SIDECAR_IMAGE:-${IMAGE_REGISTRY}/device-api-server-sidecar:${IMAGE_TAG}}"
+
+# Build settings
+BUILD_PLATFORM="${BUILD_PLATFORM:-linux/amd64}"
+
+# Node selection (for listing GPU nodes)
+GPU_NODE_SELECTOR="${GPU_NODE_SELECTOR:-nvidia.com/gpu.present=true}"
+
+# Interactive mode (set to false for CI/automated runs)
+INTERACTIVE="${INTERACTIVE:-true}"
+
+# Skip destructive demos in non-interactive mode
+SKIP_DESTRUCTIVE="${SKIP_DESTRUCTIVE:-true}"
+
+# Skip image building entirely (use pre-built images)
+SKIP_BUILD="${SKIP_BUILD:-false}"
+
+# Helm chart app name (used for pod selectors and container names)
+APP_NAME="${APP_NAME:-device-api-server}"
+CONTAINER_NAME="${CONTAINER_NAME:-device-api-server}"
+SIDECAR_CONTAINER_NAME="${SIDECAR_CONTAINER_NAME:-nvml-provider}"
+
+# ==============================================================================
+# Terminal Colors (buildah-style)
+# ==============================================================================
+
+# Enable color escapes only when stdout is a terminal; otherwise all color
+# variables are empty strings so output stays clean when piped or logged.
+if [[ -t 1 ]]; then
+ red=$(tput setaf 1)
+ green=$(tput setaf 2)
+ yellow=$(tput setaf 3)
+ blue=$(tput setaf 4)
+ magenta=$(tput setaf 5)
+ cyan=$(tput setaf 6)
+ white=$(tput setaf 7)
+ bold=$(tput bold)
+ reset=$(tput sgr0)
+else
+ red=""
+ green=""
+ yellow=""
+ blue=""
+ magenta=""
+ cyan=""
+ white=""
+ bold=""
+ reset=""
+fi
+
+# ==============================================================================
+# Helper Functions
+# ==============================================================================
+
+# banner prints a prominent boxed section title.
+banner() {
+ echo ""
+ echo "${bold}${blue}============================================================${reset}"
+ echo "${bold}${blue} $1${reset}"
+ echo "${bold}${blue}============================================================${reset}"
+ echo ""
+}
+
+# step prints a highlighted sub-step heading.
+step() {
+ echo ""
+ echo "${bold}${green}>>> $1${reset}"
+ echo ""
+}
+
+# info prints an informational line.
+info() {
+ echo "${cyan} $1${reset}"
+}
+
+# warn prints a non-fatal warning line.
+warn() {
+ echo "${yellow} WARNING: $1${reset}"
+}
+
+# error prints an error line (does not exit; callers decide).
+error() {
+ echo "${red} ERROR: $1${reset}"
+}
+
+# run_cmd echoes a command and then executes it verbatim.
+run_cmd() {
+ echo "${magenta} \$ $*${reset}"
+ "$@"
+}
+
+# pause waits for ENTER, but only in interactive mode.
+pause() {
+ if [[ "${INTERACTIVE}" == "true" ]]; then
+ echo ""
+ read -r -p "${yellow}Press ENTER to continue...${reset}"
+ echo ""
+ fi
+}
+
+# confirm prompts y/N; in non-interactive mode it auto-confirms (returns 0).
+confirm() {
+ if [[ "${INTERACTIVE}" != "true" ]]; then
+ # Auto-confirm in non-interactive mode
+ info "Auto-confirming: $1"
+ return 0
+ fi
+ echo ""
+ read -r -p "${yellow}$1 [y/N] ${reset}" response
+ case "$response" in
+ [yY][eE][sS]|[yY]) return 0 ;;
+ *) return 1 ;;
+ esac
+}
+
+# Confirm for destructive operations (skipped in non-interactive mode if SKIP_DESTRUCTIVE=true)
+confirm_destructive() {
+ if [[ "${INTERACTIVE}" != "true" && "${SKIP_DESTRUCTIVE}" == "true" ]]; then
+ info "Skipping destructive operation in non-interactive mode: $1"
+ return 1
+ fi
+ confirm "$1"
+}
+
+# check_prereqs exits with an error if kubectl/helm/docker are missing;
+# a missing buildx only produces a warning since it is needed for
+# cross-platform builds, not for the rest of the demo.
+check_prereqs() {
+ local missing=()
+
+ command -v kubectl &>/dev/null || missing+=("kubectl")
+ command -v helm &>/dev/null || missing+=("helm")
+ command -v docker &>/dev/null || missing+=("docker")
+
+ if [[ ${#missing[@]} -gt 0 ]]; then
+ error "Missing prerequisites: ${missing[*]}"
+ exit 1
+ fi
+
+ # Check for buildx (required for cross-platform builds)
+ if ! docker buildx version &>/dev/null; then
+ warn "docker buildx not available - cross-platform builds may fail"
+ warn "Run: docker buildx create --use --name multiarch"
+ else
+ info "Docker buildx: $(docker buildx version | head -1)"
+ fi
+}
+
+# ==============================================================================
+# Demo Sections
+# ==============================================================================
+
+# show_intro clears the screen (interactive mode only) and prints the
+# architecture diagram and benefits overview.
+show_intro() {
+ [[ "${INTERACTIVE}" == "true" ]] && clear
+ banner "NVML Provider Sidecar Architecture Demo"
+
+ echo "${white}This demo showcases the sidecar-based NVML provider for device-api-server.${reset}"
+ echo ""
+ echo "${white}Architecture:${reset}"
+ echo "${cyan} ┌─────────────────────────────────────────────────────────┐${reset}"
+ echo "${cyan} │ Pod │${reset}"
+ echo "${cyan} │ ┌──────────────────┐ ┌──────────────────┐ │${reset}"
+ echo "${cyan} │ │ device-api-server│ │ nvml-provider │ │${reset}"
+ echo "${cyan} │ │ (pure Go) │◄───│ (CGO + NVML) │ │${reset}"
+ echo "${cyan} │ │ Unix Socket │gRPC│ Health :8082 │ │${reset}"
+ echo "${cyan} │ │ Health :8081 │ │ RuntimeClass: │ │${reset}"
+ echo "${cyan} │ │ Metrics :9090 │ │ nvidia │ │${reset}"
+ echo "${cyan} │ └──────────────────┘ └──────────────────┘ │${reset}"
+ echo "${cyan} └─────────────────────────────────────────────────────────┘${reset}"
+ echo ""
+ echo "${white}Benefits:${reset}"
+ echo "${green} ✓ Separation of concerns (API server vs NVML access)${reset}"
+ echo "${green} ✓ Independent scaling and updates${reset}"
+ echo "${green} ✓ Better testability (mock providers)${reset}"
+ echo "${green} ✓ Crash isolation (NVML crashes don't kill API server)${reset}"
+ echo ""
+
+ pause
+}
+
+# show_config echoes the effective value of every configurable variable so
+# the operator can verify overrides before anything runs.
+show_config() {
+ banner "Configuration"
+
+ echo "${white}Current settings (override via environment variables):${reset}"
+ echo ""
+ echo "${cyan} Kubernetes:${reset}"
+ echo " KUBECONFIG = ${KUBECONFIG}"
+ echo " NAMESPACE = ${NAMESPACE}"
+ echo " RELEASE_NAME = ${RELEASE_NAME}"
+ echo ""
+ echo "${cyan} Paths:${reset}"
+ echo " CHART_PATH = ${CHART_PATH}"
+ echo " VALUES_FILE = ${VALUES_FILE}"
+ echo " DOCKERFILE = ${DOCKERFILE}"
+ echo ""
+ echo "${cyan} Images:${reset}"
+ echo " IMAGE_REGISTRY = ${IMAGE_REGISTRY}"
+ echo " IMAGE_TAG = ${IMAGE_TAG}"
+ echo " SERVER_IMAGE = ${SERVER_IMAGE}"
+ echo " SIDECAR_IMAGE = ${SIDECAR_IMAGE}"
+ echo ""
+ echo "${cyan} Build:${reset}"
+ echo " BUILD_PLATFORM = ${BUILD_PLATFORM}"
+ echo ""
+ echo "${cyan} Cluster:${reset}"
+ echo " GPU_NODE_SELECTOR = ${GPU_NODE_SELECTOR}"
+ echo ""
+ echo "${cyan} Helm Chart:${reset}"
+ echo " APP_NAME = ${APP_NAME}"
+ echo " CONTAINER_NAME = ${CONTAINER_NAME}"
+ echo " SIDECAR_CONTAINER_NAME = ${SIDECAR_CONTAINER_NAME}"
+ echo ""
+ echo "${cyan} Mode:${reset}"
+ echo " INTERACTIVE = ${INTERACTIVE}"
+ echo " SKIP_DESTRUCTIVE = ${SKIP_DESTRUCTIVE}"
+ echo " SKIP_BUILD = ${SKIP_BUILD}"
+ echo ""
+
+ pause
+}
+
+# show_cluster_info verifies cluster reachability, lists GPU nodes matching
+# GPU_NODE_SELECTOR (falling back to all nodes), and checks that the
+# 'nvidia' RuntimeClass exists. All checks are best-effort and only warn.
+show_cluster_info() {
+ banner "Step 1: Verify Cluster Connectivity"
+
+ step "Check cluster connection"
+ run_cmd kubectl --kubeconfig="${KUBECONFIG}" cluster-info
+
+ pause
+
+ step "List GPU nodes (selector: ${GPU_NODE_SELECTOR})"
+ run_cmd kubectl --kubeconfig="${KUBECONFIG}" get nodes -l "${GPU_NODE_SELECTOR}" -o wide || {
+ warn "No nodes found with selector '${GPU_NODE_SELECTOR}'"
+ info "Listing all nodes instead:"
+ run_cmd kubectl --kubeconfig="${KUBECONFIG}" get nodes -o wide
+ }
+
+ pause
+
+ step "Verify nvidia RuntimeClass exists"
+ run_cmd kubectl --kubeconfig="${KUBECONFIG}" get runtimeclass nvidia -o yaml || {
+ warn "RuntimeClass 'nvidia' not found. GPU access may not work."
+ }
+
+ pause
+}
+
+check_image_exists() {
+ local image="$1"
+ # Try to inspect the manifest - if it exists, the image is available
+ docker buildx imagetools inspect "${image}" &>/dev/null 2>&1
+}
+
+# build_images builds and pushes the server and sidecar images with buildx.
+# Honors SKIP_BUILD (use pre-built images) and, when an image already exists
+# in the registry, asks before rebuilding (auto-confirmed when
+# INTERACTIVE=false, so CI runs always rebuild existing images).
+build_images() {
+ banner "Step 2: Build Container Images"
+
+ if [[ "${SKIP_BUILD}" == "true" ]]; then
+ info "SKIP_BUILD=true, skipping image builds"
+ info "Using pre-built images:"
+ info " SERVER_IMAGE: ${SERVER_IMAGE}"
+ info " SIDECAR_IMAGE: ${SIDECAR_IMAGE}"
+ return 0
+ fi
+
+ info "Building images for registry: ${IMAGE_REGISTRY}"
+ info "Using unified multi-target Dockerfile at ${DOCKERFILE}"
+ info "Target platform: ${BUILD_PLATFORM}"
+ echo ""
+
+ # Ensure buildx is available for cross-platform builds
+ if ! docker buildx version &>/dev/null; then
+ error "docker buildx is required for cross-platform builds"
+ error "Install Docker Desktop or run: docker buildx create --use"
+ exit 1
+ fi
+
+ # Check if images already exist
+ local need_server=true
+ local need_sidecar=true
+
+ if check_image_exists "${SERVER_IMAGE}"; then
+ info "Image ${SERVER_IMAGE} already exists"
+ if ! confirm "Rebuild device-api-server image?"; then
+ need_server=false
+ fi
+ fi
+
+ if check_image_exists "${SIDECAR_IMAGE}"; then
+ info "Image ${SIDECAR_IMAGE} already exists"
+ if ! confirm "Rebuild device-api-server-sidecar image?"; then
+ need_sidecar=false
+ fi
+ fi
+
+ if [[ "${need_server}" == "true" ]]; then
+ step "Build and push device-api-server image (CGO_ENABLED=0)"
+ info "This is a pure Go binary with no NVML dependencies"
+ info "Building for ${BUILD_PLATFORM} and pushing directly..."
+ run_cmd docker buildx build \
+ --platform "${BUILD_PLATFORM}" \
+ --target device-api-server \
+ -t "${SERVER_IMAGE}" \
+ -f "${DOCKERFILE}" \
+ --push \
+ .
+ pause
+ else
+ info "Skipping device-api-server build"
+ fi
+
+ if [[ "${need_sidecar}" == "true" ]]; then
+ step "Build and push device-api-server-sidecar image (CGO_ENABLED=1)"
+ info "This is the NVML provider sidecar with glibc runtime"
+ info "Building for ${BUILD_PLATFORM} and pushing directly..."
+ run_cmd docker buildx build \
+ --platform "${BUILD_PLATFORM}" \
+ --target nvml-provider \
+ -t "${SIDECAR_IMAGE}" \
+ -f "${DOCKERFILE}" \
+ --push \
+ .
+ pause
+ else
+ info "Skipping device-api-server-sidecar build"
+ fi
+}
+
+# show_values_file prints a hard-coded summary of the sidecar-enabling Helm
+# values, then cats the real VALUES_FILE if it exists. The echoed summary is
+# illustrative only; the file is the source of truth.
+show_values_file() {
+ banner "Step 3: Review Helm Values"
+
+ info "The sidecar architecture is enabled via Helm values"
+ echo ""
+
+ step "Key configuration in ${VALUES_FILE}:"
+ echo ""
+ echo "${cyan}# Disable built-in NVML provider${reset}"
+ echo "${white}nvml:${reset}"
+ echo "${white} enabled: false${reset}"
+ echo ""
+ echo "${cyan}# Enable NVML Provider sidecar${reset}"
+ echo "${white}nvmlProvider:${reset}"
+ echo "${white} enabled: true${reset}"
+ echo "${white} image:${reset}"
+ echo "${white} repository: ${IMAGE_REGISTRY}/device-api-server-sidecar${reset}"
+ echo "${white} tag: \"${IMAGE_TAG}\"${reset}"
+ echo "${white} # Sidecar connects via shared unix socket volume${reset}"
+ echo "${white} runtimeClassName: nvidia${reset}"
+ echo ""
+
+ if [[ -f "${VALUES_FILE}" ]]; then
+ step "Full values file:"
+ run_cmd cat "${VALUES_FILE}"
+ fi
+
+ pause
+}
+
+# deploy_sidecar installs or upgrades the Helm release in NAMESPACE, forcing
+# the image repository/tag to the ones just built (overriding the values
+# file), then waits for the DaemonSet rollout and shows container readiness.
+deploy_sidecar() {
+ banner "Step 4: Deploy with Sidecar Architecture"
+
+ step "Create namespace if not exists"
+ echo "${magenta} \$ kubectl create namespace ${NAMESPACE} --dry-run=client -o yaml | kubectl apply -f -${reset}"
+ kubectl --kubeconfig="${KUBECONFIG}" create namespace "${NAMESPACE}" --dry-run=client -o yaml | \
+ kubectl --kubeconfig="${KUBECONFIG}" apply -f -
+
+ pause
+
+ # Check if release already exists
+ # Build --set overrides to ensure Helm uses the same images we just built,
+ # regardless of what the values file says.
+ IMAGE_OVERRIDES=(
+ --set "image.repository=${IMAGE_REGISTRY}/device-api-server"
+ --set "image.tag=${IMAGE_TAG}"
+ --set "nvmlProvider.image.repository=${IMAGE_REGISTRY}/device-api-server-sidecar"
+ --set "nvmlProvider.image.tag=${IMAGE_TAG}"
+ )
+
+ if helm status "${RELEASE_NAME}" --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" &>/dev/null; then
+ info "Release '${RELEASE_NAME}' already exists"
+ step "Upgrading existing release..."
+ run_cmd helm upgrade "${RELEASE_NAME}" "${CHART_PATH}" \
+ --kubeconfig="${KUBECONFIG}" \
+ --namespace "${NAMESPACE}" \
+ -f "${VALUES_FILE}" \
+ "${IMAGE_OVERRIDES[@]}"
+
+ step "Restarting pods to pick up changes..."
+ run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" rollout restart daemonset "${RELEASE_NAME}"
+ else
+ step "Installing new release..."
+ run_cmd helm install "${RELEASE_NAME}" "${CHART_PATH}" \
+ --kubeconfig="${KUBECONFIG}" \
+ --namespace "${NAMESPACE}" \
+ -f "${VALUES_FILE}" \
+ "${IMAGE_OVERRIDES[@]}"
+ fi
+
+ pause
+
+ step "Waiting for pods to be ready (timeout 2m)..."
+ if ! kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" rollout status daemonset "${RELEASE_NAME}" --timeout=2m; then
+ warn "Rollout not complete within timeout. Checking status..."
+ fi
+
+ step "Current pod status"
+ # NOTE(review): ${APP_NAME} is unquoted in the -l selector here and below;
+ # fine for the default value, but would word-split if APP_NAME ever
+ # contained whitespace — consider quoting.
+ run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=${APP_NAME} -o wide
+
+ pause
+
+ step "Verify both containers are running in each pod"
+ info "Each pod should have 2/2 containers ready"
+ run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=${APP_NAME} \
+ -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\t"}{range .status.containerStatuses[*]}{.name}:{.ready}{" "}{end}{"\n"}{end}'
+
+ pause
+}
+
+verify_gpu_registration() {
+ banner "Step 5: Verify GPU Registration"
+
+ step "Wait for pods to be ready"
+ info "Waiting up to 60 seconds for pods to start..."
+ if ! kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" wait --for=condition=ready pod -l app.kubernetes.io/name=${APP_NAME} --timeout=60s 2>/dev/null; then
+ warn "Pods may not be ready yet. Checking status..."
+ run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=${APP_NAME} -o wide
+ run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" describe pods -l app.kubernetes.io/name=${APP_NAME} | tail -30
+ error "Pods not ready. Check the output above for issues."
+ return 1
+ fi
+
+ pause
+
+ step "Verify DaemonSet coverage on all GPU nodes"
+ local gpu_nodes_ready
+ local gpu_nodes_total
+ local daemonset_desired
+ local daemonset_ready
+
+ gpu_nodes_total=$(kubectl --kubeconfig="${KUBECONFIG}" get nodes -l "${GPU_NODE_SELECTOR}" --no-headers 2>/dev/null | wc -l | tr -d ' ')
+ gpu_nodes_ready=$(kubectl --kubeconfig="${KUBECONFIG}" get nodes -l "${GPU_NODE_SELECTOR}" --no-headers 2>/dev/null | grep -c " Ready" || true)
+ # Ensure gpu_nodes_ready is a valid number (grep -c returns 0 with exit code 1 when no matches)
+ [[ -z "${gpu_nodes_ready}" ]] && gpu_nodes_ready=0
+ daemonset_desired=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get daemonset "${RELEASE_NAME}" -o jsonpath='{.status.desiredNumberScheduled}' 2>/dev/null || echo "0")
+ daemonset_ready=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get daemonset "${RELEASE_NAME}" -o jsonpath='{.status.numberReady}' 2>/dev/null || echo "0")
+
+ echo ""
+ info "GPU Nodes (total): ${gpu_nodes_total}"
+ info "GPU Nodes (Ready): ${gpu_nodes_ready}"
+ info "DaemonSet (desired): ${daemonset_desired}"
+ info "DaemonSet (ready): ${daemonset_ready}"
+ echo ""
+
+ if [[ "${daemonset_ready}" -eq "${gpu_nodes_ready}" && "${daemonset_ready}" -gt 0 ]]; then
+ echo "${green} ✓ DaemonSet running on all ${daemonset_ready} Ready GPU nodes${reset}"
+ else
+ warn "DaemonSet coverage mismatch! Expected ${gpu_nodes_ready} pods, got ${daemonset_ready}"
+ run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get daemonset "${RELEASE_NAME}"
+ fi
+
+ pause
+
+ step "List all pods and their nodes"
+ run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=${APP_NAME} -o wide
+
+ pause
+
+ step "Get a pod name for testing"
+ POD=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=${APP_NAME} -o jsonpath='{.items[0].metadata.name}')
+ if [[ -z "${POD}" ]]; then
+ error "No pods found. DaemonSet may not be scheduling on any nodes."
+ run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get daemonset
+ return 1
+ fi
+ NODE=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pod "${POD}" -o jsonpath='{.spec.nodeName}')
+ info "Using pod: ${POD} (on node: ${NODE})"
+
+ pause
+
+ step "Check device-api-server logs for provider connection"
+ run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" logs "${POD}" -c "${CONTAINER_NAME}" --tail=20 || true
+
+ pause
+
+ step "Check nvml-provider sidecar logs"
+ run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" logs "${POD}" -c "${SIDECAR_CONTAINER_NAME}" --tail=20 || true
+
+ pause
+
+ verify_gpu_uuid_match "${POD}" "${NODE}"
+}
+
+# verify_gpu_uuid_match: Step 5b of the demo. Cross-checks the GPU UUIDs that
+# nvidia-smi reports on the node against the uuid="GPU-..." entries found in
+# the pod's logs, printing a per-UUID MATCHED / NOT FOUND verdict.
+#
+# Args:
+#   $1 - pod name to exec into / read logs from
+#   $2 - node the pod runs on (display only)
+# Returns 1 when either UUID source yields nothing.
+verify_gpu_uuid_match() {
+    local pod="$1"
+    local node="$2"
+
+    banner "Step 5b: Verify GPU UUID Match"
+
+    info "Comparing GPU UUIDs from nvidia-smi with device-api-server registered GPUs"
+    info "Pod: ${pod} | Node: ${node}"
+    echo ""
+
+    step "Get GPU UUID from nvidia-smi on the node (via sidecar container)"
+    local nvidia_smi_uuids
+    nvidia_smi_uuids=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" exec "${pod}" -c "${SIDECAR_CONTAINER_NAME}" -- \
+        nvidia-smi --query-gpu=uuid --format=csv,noheader 2>/dev/null || echo "")
+
+    if [[ -z "${nvidia_smi_uuids}" ]]; then
+        warn "Could not get GPU UUIDs from nvidia-smi"
+        return 1
+    fi
+
+    echo "${cyan}  nvidia-smi GPU UUIDs:${reset}"
+    echo "${nvidia_smi_uuids}" | while read -r uuid; do
+        echo "    - ${uuid}"
+    done
+    echo ""
+
+    pause
+
+    step "Get registered GPU UUIDs from device-api-server logs"
+    local registered_uuids
+    # NOTE(review): the step text says "device-api-server logs" but this reads
+    # the sidecar container's logs. Confirm which container actually logs the
+    # uuid="GPU-..." registration lines and align the step text or container.
+    registered_uuids=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" logs "${pod}" -c "${SIDECAR_CONTAINER_NAME}" 2>/dev/null | \
+        grep -o 'uuid="GPU-[^"]*"' | sed 's/uuid="//;s/"$//' | sort -u || echo "")
+
+    if [[ -z "${registered_uuids}" ]]; then
+        warn "Could not find registered GPU UUIDs in logs"
+        return 1
+    fi
+
+    echo "${cyan}  Registered GPU UUIDs:${reset}"
+    echo "${registered_uuids}" | while read -r uuid; do
+        echo "    - ${uuid}"
+    done
+    echo ""
+
+    pause
+
+    step "Compare UUIDs"
+    local match_count=0
+    local total_count=0
+
+    # Here-string (not a pipe) so the counters survive the loop's subshell-free
+    # execution. -F matches the UUID as a literal string, not a regex.
+    while read -r smi_uuid; do
+        [[ -z "${smi_uuid}" ]] && continue
+        total_count=$((total_count + 1))
+        if echo "${registered_uuids}" | grep -qF "${smi_uuid}"; then
+            echo "${green}  ✓ ${smi_uuid} - MATCHED${reset}"
+            match_count=$((match_count + 1))
+        else
+            echo "${red}  ✗ ${smi_uuid} - NOT FOUND in registered GPUs${reset}"
+        fi
+    done <<< "${nvidia_smi_uuids}"
+
+    echo ""
+    if [[ "${match_count}" -eq "${total_count}" && "${total_count}" -gt 0 ]]; then
+        echo "${green}  ✓ All ${total_count} GPU(s) from nvidia-smi are registered in device-api-server${reset}"
+    else
+        warn "UUID mismatch: ${match_count}/${total_count} GPUs matched"
+    fi
+
+    pause
+}
+
+# demonstrate_crash_recovery: Step 6 of the demo. Optionally kills PID 1 of
+# the nvml-provider container to show that the API server container keeps
+# running while kubelet restarts only the crashed sidecar. Destructive, so it
+# is gated behind confirm_destructive.
+demonstrate_crash_recovery() {
+    banner "Step 6: Demonstrate Crash Recovery"
+
+    info "The sidecar architecture provides crash isolation."
+    info "If the NVML provider crashes, the API server continues running"
+    info "and will reconnect when the provider restarts."
+    echo ""
+
+    step "Get current pod"
+    POD=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=${APP_NAME} -o jsonpath='{.items[0].metadata.name}')
+    info "Using pod: ${POD}"
+
+    pause
+
+    if confirm_destructive "Kill the nvml-provider container to demonstrate crash recovery?"; then
+        step "Killing nvml-provider container..."
+        # kill 1 signals the container's init process (the provider binary);
+        # || true tolerates the exec session dying with the container.
+        run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" exec "${POD}" -c "${SIDECAR_CONTAINER_NAME}" -- kill 1 || true
+
+        # Fixed sleep gives kubelet time to restart the container before we
+        # inspect the pod; restarts may take longer on a loaded node.
+        info "Waiting for container restart..."
+        sleep 5
+
+        step "Check pod status (should show restart count)"
+        run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pod "${POD}" -o wide
+
+        step "Verify API server continued running"
+        run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" logs "${POD}" -c "${CONTAINER_NAME}" --tail=10 || true
+
+        step "Verify provider reconnected"
+        run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" logs "${POD}" -c "${SIDECAR_CONTAINER_NAME}" --tail=10 || true
+    else
+        info "Skipping crash recovery demonstration"
+    fi
+
+    pause
+}
+
+# show_metrics: Step 7 of the demo. Fetches Prometheus metrics from the API
+# server container on port 9090 and filters for the device_apiserver_* and
+# gRPC handler series.
+show_metrics() {
+    banner "Step 7: View Provider Metrics"
+
+    step "Get pod for port-forward"
+    POD=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=${APP_NAME} -o jsonpath='{.items[0].metadata.name}')
+
+    step "Fetch metrics from the API server"
+    info "Key metrics to look for:"
+    info "  - device_apiserver_service_status: Whether services are serving"
+    info "  - device_apiserver_build_info: Build information"
+    info "  - grpc_server_*: gRPC request/stream metrics"
+    echo ""
+
+    # Try wget first, fall back to curl, and finally || true: the minimal
+    # container image may ship only one (or neither) HTTP client.
+    run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" exec "${POD}" -c "${CONTAINER_NAME}" -- \
+        wget -qO- http://localhost:9090/metrics 2>/dev/null | grep -E "^(device_apiserver_|grpc_server_handled_total)" | sort || {
+        run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" exec "${POD}" -c "${CONTAINER_NAME}" -- \
+            curl -s http://localhost:9090/metrics 2>/dev/null | grep -E "^(device_apiserver_|grpc_server_handled_total)" | sort || true
+    }
+
+    pause
+}
+
+# cleanup: final demo step. Offers to uninstall the Helm release; declining
+# leaves the deployment in place.
+cleanup() {
+    banner "Cleanup"
+
+    # Guard clause: bail out early when the user declines the teardown.
+    if ! confirm_destructive "Remove the sidecar deployment and restore default?"; then
+        info "Skipping cleanup. Release '${RELEASE_NAME}' left in namespace '${NAMESPACE}'"
+        return
+    fi
+
+    step "Uninstalling Helm release..."
+    run_cmd helm uninstall "${RELEASE_NAME}" \
+        --kubeconfig="${KUBECONFIG}" \
+        --namespace "${NAMESPACE}" || true
+
+    info "Cleanup complete!"
+}
+
+# show_summary: prints a recap of everything the demo covered, the images it
+# built, the key files involved, and the environment variables that customize
+# a re-run. Pure output; no cluster interaction.
+show_summary() {
+    banner "Demo Complete!"
+
+    echo "${white}What we demonstrated:${reset}"
+    echo "${green}  ✓ Built separate images for device-api-server and device-api-server-sidecar${reset}"
+    echo "${green}  ✓ Deployed as sidecar architecture via Helm${reset}"
+    echo "${green}  ✓ Verified DaemonSet runs on ALL GPU nodes${reset}"
+    echo "${green}  ✓ Verified GPU UUIDs match between nvidia-smi and device-api-server${reset}"
+    echo "${green}  ✓ Showed crash isolation and recovery${reset}"
+    echo "${green}  ✓ Explored provider metrics${reset}"
+    echo ""
+    echo "${white}Images built:${reset}"
+    echo "${cyan}  - ${SERVER_IMAGE}${reset}"
+    echo "${cyan}  - ${SIDECAR_IMAGE}${reset}"
+    echo ""
+    echo "${white}Key files:${reset}"
+    echo "${cyan}  - ${DOCKERFILE}     # Multi-target container build${reset}"
+    echo "${cyan}  - ${VALUES_FILE}    # Helm values for sidecar mode${reset}"
+    echo "${cyan}  - ${CHART_PATH}/    # Helm chart with sidecar support${reset}"
+    echo ""
+    echo "${white}Environment variables for customization:${reset}"
+    echo "${cyan}  KUBECONFIG, NAMESPACE, RELEASE_NAME, IMAGE_REGISTRY, IMAGE_TAG,${reset}"
+    echo "${cyan}  SERVER_IMAGE, SIDECAR_IMAGE, BUILD_PLATFORM, GPU_NODE_SELECTOR,${reset}"
+    echo "${cyan}  CHART_PATH, VALUES_FILE, DOCKERFILE${reset}"
+    echo ""
+}
+
+# ==============================================================================
+# Main
+# ==============================================================================
+
+# main: demo driver. Walks through intro/config/prereqs, then optionally
+# builds images, reviews values, deploys, verifies, and cleans up. Build and
+# deploy phases are individually skippable via confirm prompts; note that
+# verification, crash recovery, metrics, and cleanup only run when the
+# deployment step was accepted.
+main() {
+    # Re-export so child kubectl/helm processes see any KUBECONFIG override
+    # set earlier in this script.
+    export KUBECONFIG
+
+    show_intro
+    show_config
+    check_prereqs
+    show_cluster_info
+
+    if confirm "Build and push container images?"; then
+        build_images
+    else
+        info "Skipping image build. Using existing images at ${IMAGE_REGISTRY}"
+    fi
+
+    show_values_file
+
+    if confirm "Deploy the sidecar architecture to the cluster?"; then
+        deploy_sidecar
+        verify_gpu_registration
+        demonstrate_crash_recovery
+        show_metrics
+        cleanup
+    else
+        info "Skipping deployment"
+    fi
+
+    show_summary
+}
+
+# Run main only when the script is executed directly; sourcing the file
+# (e.g. to test individual functions) does not start the demo.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+    main "$@"
+fi
diff --git a/deployments/container/Dockerfile b/deployments/container/Dockerfile
new file mode 100644
index 000000000..d322f3a2f
--- /dev/null
+++ b/deployments/container/Dockerfile
@@ -0,0 +1,190 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Multi-target Dockerfile for NVSentinel components
+#
+# Targets:
+# device-api-server - Pure Go server (no NVML, uses sidecar provider)
+# nvml-provider - NVML provider sidecar (CGO, requires RuntimeClass nvidia)
+#
+# Build examples:
+# # Build device-api-server (default, pure Go)
+# docker build --target device-api-server -t nvsentinel/device-api-server .
+#
+# # Build nvml-provider sidecar
+# docker build --target nvml-provider -t nvsentinel/nvml-provider .
+#
+# Note: NVML provider requires glibc runtime (Debian) for RTLD_DEEPBIND support
+
+# TODO: Add Cosign image signing and SBOM generation to CI/CD pipeline.
+# See: https://docs.sigstore.dev/signing/quickstart/
+# Steps:
+# 1. Sign images with cosign: cosign sign --key
+# 2. Generate SBOM: syft -o cyclonedx-json > sbom.json
+# 3. Attach SBOM: cosign attach sbom --sbom sbom.json
+
+# ==============================================================================
+# Build Arguments
+# ==============================================================================
+
+# ARGs declared before the first FROM are global defaults; each build stage
+# re-declares the ones it consumes (done in both builder stages below).
+ARG GOLANG_VERSION=1.25
+ARG VERSION=dev
+ARG GIT_COMMIT=unknown
+ARG GIT_TREE_STATE=dirty
+ARG BUILD_DATE
+
+# ==============================================================================
+# Base Builder - Pure Go (Alpine)
+# ==============================================================================
+
+FROM golang:${GOLANG_VERSION}-alpine AS builder-alpine
+
+ARG VERSION
+ARG GIT_COMMIT
+ARG GIT_TREE_STATE
+ARG BUILD_DATE
+
+WORKDIR /workspace
+
+# Install build dependencies
+RUN apk add --no-cache git make
+
+# Copy go mod files first for caching
+COPY go.mod go.sum ./
+
+# Download dependencies
+RUN go mod download
+
+# Copy source code
+COPY . .
+
+# Version package path
+ARG VERSION_PKG=github.com/nvidia/nvsentinel/pkg/version
+
+# Build device-api-server (CGO disabled, pure Go)
+# -ldflags -s -w strips symbol/debug info; -X injects version metadata.
+RUN CGO_ENABLED=0 GOOS=linux go build \
+    -ldflags "-s -w \
+    -X ${VERSION_PKG}.Version=${VERSION} \
+    -X ${VERSION_PKG}.GitCommit=${GIT_COMMIT} \
+    -X ${VERSION_PKG}.GitTreeState=${GIT_TREE_STATE} \
+    -X ${VERSION_PKG}.BuildDate=${BUILD_DATE}" \
+    -o /build/device-api-server \
+    ./cmd/device-api-server
+
+# ==============================================================================
+# Base Builder - CGO (Debian/glibc)
+# ==============================================================================
+
+FROM golang:${GOLANG_VERSION}-bookworm AS builder-debian
+
+ARG VERSION
+ARG GIT_COMMIT
+ARG GIT_TREE_STATE
+ARG BUILD_DATE
+
+WORKDIR /workspace
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    make \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy go mod files first for caching
+COPY go.mod go.sum ./
+
+# Download dependencies
+RUN go mod download
+
+# Copy source code
+COPY . .
+
+# Version package path
+ARG VERSION_PKG=github.com/nvidia/nvsentinel/pkg/version
+
+# Build nvml-provider (CGO enabled for go-nvml)
+# The nvml build tag selects the CGO-backed implementation.
+RUN CGO_ENABLED=1 go build \
+    -tags=nvml \
+    -ldflags "-s -w \
+    -X ${VERSION_PKG}.Version=${VERSION} \
+    -X ${VERSION_PKG}.GitCommit=${GIT_COMMIT} \
+    -X ${VERSION_PKG}.GitTreeState=${GIT_TREE_STATE} \
+    -X ${VERSION_PKG}.BuildDate=${BUILD_DATE}" \
+    -o /build/nvml-provider \
+    ./cmd/nvml-provider
+
+# ==============================================================================
+# Target: device-api-server
+# ==============================================================================
+# Pure Go server with no NVML dependencies. Uses sidecar provider for GPU access.
+# Small image size, fast startup, works on any architecture.
+
+# Pinned to digest for reproducible builds. Update with:
+#   docker manifest inspect alpine:3.21 | jq '.manifests[] | select(.platform.architecture=="amd64") | .digest'
+# NOTE(review): this digest is selected for amd64; confirm that multi-arch
+# builds (--platform via buildx) still resolve correctly with a
+# platform-specific digest pin, or pin the manifest-list digest instead.
+FROM alpine:3.21@sha256:22e0ec13c0db6b3e1ba3280e831fc50ba7bffe58e81f31670a64b1afede247bc AS device-api-server
+
+LABEL org.opencontainers.image.source="https://github.com/nvidia/nvsentinel"
+LABEL org.opencontainers.image.description="NVSentinel Device API Server - Node-local GPU device state cache"
+LABEL org.opencontainers.image.licenses="Apache-2.0"
+LABEL org.opencontainers.image.title="device-api-server"
+
+# Add ca-certificates for HTTPS
+RUN apk add --no-cache ca-certificates
+
+WORKDIR /
+
+COPY --from=builder-alpine --chmod=755 /build/device-api-server /device-api-server
+
+# Run as non-root user (nobody)
+USER 65534:65534
+
+# Health probe port (configurable via --health-probe-bind-address)
+EXPOSE 8081
+# Metrics port (configurable via --metrics-bind-address)
+EXPOSE 9090
+
+ENTRYPOINT ["/device-api-server"]
+
+# ==============================================================================
+# Target: nvml-provider
+# ==============================================================================
+# NVML provider sidecar for GPU enumeration and health monitoring.
+# Requires glibc runtime (Debian) for RTLD_DEEPBIND support.
+# Must run with RuntimeClass: nvidia to access NVML libraries.
+
+# Pinned to digest for reproducible builds. Update with:
+#   docker manifest inspect debian:bookworm-slim | jq '.manifests[] | select(.platform.architecture=="amd64") | .digest'
+# NOTE(review): same amd64-digest caveat as the alpine base above.
+FROM debian:bookworm-slim@sha256:6458e6ce2b6448e31bfdced4be7d8aa88d389e6694ab09f5a718a694abe147f4 AS nvml-provider
+
+LABEL org.opencontainers.image.source="https://github.com/nvidia/nvsentinel"
+LABEL org.opencontainers.image.description="NVSentinel NVML Provider - GPU enumeration and health monitoring sidecar"
+LABEL org.opencontainers.image.licenses="Apache-2.0"
+LABEL org.opencontainers.image.title="nvml-provider"
+
+# Add ca-certificates for HTTPS
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /
+
+COPY --from=builder-debian --chmod=755 /build/nvml-provider /nvml-provider
+
+# Run as non-root user
+# NOTE(review): assumes the nvidia runtime injects the driver libraries and
+# device nodes with permissions readable by uid 65534 — confirm on target nodes.
+USER 65534:65534
+
+# Health check port
+EXPOSE 8082
+
+ENTRYPOINT ["/nvml-provider"]
diff --git a/deployments/helm/device-api-server/Chart.yaml b/deployments/helm/device-api-server/Chart.yaml
new file mode 100644
index 000000000..10f76a543
--- /dev/null
+++ b/deployments/helm/device-api-server/Chart.yaml
@@ -0,0 +1,51 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v2
+name: device-api-server
+description: |
+ Device API Server - Node-local GPU device state cache server for Kubernetes.
+
+ The Device API Server acts as an intermediary between providers (health monitors)
+ that update GPU device states and consumers (device plugins, DRA drivers) that
+ read device states for scheduling decisions.
+
+ Key features:
+ - Read-blocking semantics during provider updates
+ - Sidecar architecture for NVML isolation
+ - Multiple provider and consumer support
+ - Prometheus metrics and alerting
+ - Health-based GPU scheduling decisions
+type: application
+version: 0.1.0
+appVersion: "0.1.0"
+kubeVersion: ">=1.25.0-0"
+keywords:
+ - nvidia
+ - gpu
+ - device
+ - nvml
+ - health
+ - daemonset
+ - grpc
+home: https://github.com/nvidia/nvsentinel
+sources:
+ - https://github.com/nvidia/nvsentinel
+maintainers:
+ - name: NVIDIA
+ url: https://github.com/nvidia
+icon: https://www.nvidia.com/favicon.ico
+annotations:
+ artifacthub.io/license: Apache-2.0
+ artifacthub.io/category: monitoring-logging
diff --git a/deployments/helm/device-api-server/README.md b/deployments/helm/device-api-server/README.md
new file mode 100644
index 000000000..b8990a413
--- /dev/null
+++ b/deployments/helm/device-api-server/README.md
@@ -0,0 +1,263 @@
+# Device API Server Helm Chart
+
+Node-local GPU device state cache server for Kubernetes.
+
+## Introduction
+
+The Device API Server is a DaemonSet that runs on each GPU node, providing a local gRPC cache for GPU device states. It acts as an intermediary between:
+
+- **Providers** (health monitors) that update GPU device states
+- **Consumers** (device plugins, DRA drivers) that read device states for scheduling decisions
+
+Key features:
+
+- Read-blocking semantics during provider updates
+- Multiple provider and consumer support
+- Optional NVML fallback provider for GPU enumeration and XID monitoring
+- Prometheus metrics and alerting
+- Unix socket for node-local communication
+
+## Prerequisites
+
+- Kubernetes 1.25+
+- Helm 3.0+
+- (Optional) NVIDIA GPU Operator for NVML provider support
+- (Optional) Prometheus Operator for ServiceMonitor/PrometheusRule
+
+## Installation
+
+### Quick Start
+
+```bash
+# Add the Helm repository (when published)
+helm repo add nvsentinel https://nvidia.github.io/nvsentinel
+helm repo update
+
+# Install with default configuration
+helm install device-api-server nvsentinel/device-api-server \
+ --namespace device-api --create-namespace
+```
+
+### Install from Local Chart
+
+```bash
+helm install device-api-server ./deployments/helm/device-api-server \
+ --namespace device-api --create-namespace
+```
+
+### Install with NVML Provider
+
+To enable built-in GPU enumeration and health monitoring via NVML:
+
+```bash
+helm install device-api-server ./deployments/helm/device-api-server \
+ --namespace device-api --create-namespace \
+ --set nvmlProvider.enabled=true
+```
+
+> **Note**: NVML provider requires the `nvidia` RuntimeClass. Install the NVIDIA GPU Operator or create it manually.
+
+### Install with Prometheus Monitoring
+
+```bash
+helm install device-api-server ./deployments/helm/device-api-server \
+ --namespace device-api --create-namespace \
+ --set metrics.serviceMonitor.enabled=true \
+ --set metrics.prometheusRule.enabled=true
+```
+
+## Configuration
+
+See [values.yaml](values.yaml) for the full list of configurable parameters.
+
+### Key Parameters
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `image.repository` | Image repository | `ghcr.io/nvidia/device-api-server` |
+| `image.tag` | Image tag | Chart appVersion |
+| `server.grpcAddress` | gRPC server address | `:50051` |
+| `server.unixSocket` | Unix socket path | `/var/run/device-api/device.sock` |
+| `server.healthPort` | Health endpoint port | `8081` |
+| `server.metricsPort` | Metrics endpoint port | `9090` |
+| `nvmlProvider.enabled` | Enable NVML provider sidecar | `false` |
+| `nvmlProvider.driverRoot` | NVIDIA driver library root | `/run/nvidia/driver` |
+| `nvmlProvider.healthCheckEnabled` | Enable XID event monitoring | `true` |
+| `runtimeClassName` | Pod RuntimeClass | `""` |
+| `nodeSelector` | Node selector | `nvidia.com/gpu.present: "true"` |
+| `metrics.serviceMonitor.enabled` | Create ServiceMonitor | `false` |
+| `metrics.prometheusRule.enabled` | Create PrometheusRule | `false` |
+
+### Resource Configuration
+
+```yaml
+resources:
+ requests:
+ cpu: 50m
+ memory: 64Mi
+ limits:
+ cpu: 200m
+ memory: 256Mi
+```
+
+### NVML Provider Configuration
+
+```yaml
+nvmlProvider:
+ enabled: true
+ driverRoot: /run/nvidia/driver
+ healthCheckEnabled: true
+```
+
+Default ignored XIDs (application errors): 13, 31, 43, 45, 68, 109
+
+### Node Scheduling
+
+By default, the DaemonSet schedules only on nodes with `nvidia.com/gpu.present=true` label:
+
+```yaml
+nodeSelector:
+ nvidia.com/gpu.present: "true"
+
+tolerations:
+ - key: nvidia.com/gpu
+ operator: Exists
+ effect: NoSchedule
+```
+
+Override for custom environments:
+
+```bash
+helm install device-api-server ./deployments/helm/device-api-server \
+ --set 'nodeSelector.node-type=gpu' \
+ --set 'nodeSelector.nvidia\.com/gpu\.present=null'
+```
+
+## Metrics
+
+The server exposes Prometheus metrics at `/metrics` on the configured `metricsPort`.
+
+### Available Metrics
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `device_api_server_info` | Gauge | Server information |
+| `device_api_server_cache_gpus_total` | Gauge | Total GPUs in cache |
+| `device_api_server_cache_gpus_healthy` | Gauge | Healthy GPUs |
+| `device_api_server_cache_gpus_unhealthy` | Gauge | Unhealthy GPUs |
+| `device_api_server_cache_updates_total` | Counter | Cache update operations |
+| `device_api_server_watch_streams_active` | Gauge | Active watch streams |
+| `device_api_server_watch_events_total` | Counter | Watch events sent |
+| `device_api_server_nvml_provider_enabled` | Gauge | NVML provider status |
+| `device_api_server_nvml_gpu_count` | Gauge | GPUs discovered by NVML |
+
+### Alerting Rules
+
+When `metrics.prometheusRule.enabled=true`, the following alerts are configured:
+
+| Alert | Severity | Description |
+|-------|----------|-------------|
+| `DeviceAPIServerDown` | Critical | Server unreachable for 5m |
+| `DeviceAPIServerHighLatency` | Warning | P99 latency > 500ms |
+| `DeviceAPIServerHighErrorRate` | Warning | Error rate > 10% |
+| `DeviceAPIServerUnhealthyGPUs` | Warning | Unhealthy GPUs detected |
+| `DeviceAPIServerNoGPUs` | Warning | No GPUs registered for 10m |
+| `DeviceAPIServerNVMLProviderDown` | Warning | NVML provider not running |
+
+## Client Connection
+
+Clients on the same node can connect via:
+
+### Unix Socket (Recommended)
+
+```go
+conn, err := grpc.NewClient(
+    "unix:///var/run/device-api/device.sock",
+    grpc.WithTransportCredentials(insecure.NewCredentials()),
+)
+```
+
+### TCP
+
+```go
+conn, err := grpc.NewClient(
+    "localhost:50051",
+    grpc.WithTransportCredentials(insecure.NewCredentials()),
+)
+```
+
+### grpcurl Examples
+
+```bash
+# List available services
+grpcurl -plaintext localhost:50051 list
+
+# List GPUs
+grpcurl -plaintext localhost:50051 nvidia.device.v1alpha1.GpuService/ListGpus
+
+# Watch GPU changes
+grpcurl -plaintext localhost:50051 nvidia.device.v1alpha1.GpuService/WatchGpus
+```
+
+## Upgrading
+
+```bash
+helm upgrade device-api-server ./deployments/helm/device-api-server \
+ --namespace device-api \
+ --reuse-values \
+ --set image.tag=v0.2.0
+```
+
+## Uninstallation
+
+```bash
+helm uninstall device-api-server --namespace device-api
+```
+
+## Troubleshooting
+
+### Pod Not Scheduling
+
+Check node labels:
+
+```bash
+kubectl get nodes --show-labels | grep gpu
+```
+
+Ensure nodes have `nvidia.com/gpu.present=true` or override `nodeSelector`.
+
+### NVML Provider Fails to Start
+
+1. Verify RuntimeClass exists:
+
+ ```bash
+ kubectl get runtimeclass nvidia
+ ```
+
+2. Check NVIDIA driver is installed on nodes:
+
+ ```bash
+   kubectl debug node/<node-name> -it --image=nvidia/cuda:12.0-base -- nvidia-smi
+ ```
+
+3. Check pod logs for NVML errors:
+
+ ```bash
+ kubectl logs -n device-api -l app.kubernetes.io/name=device-api-server
+ ```
+
+### Permission Denied on Unix Socket
+
+If using custom security contexts, ensure the socket directory is writable:
+
+```yaml
+securityContext:
+ runAsUser: 0 # May be needed for hostPath access
+ runAsNonRoot: false
+```
+
+## License
+
+Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
+Licensed under the Apache License, Version 2.0.
diff --git a/deployments/helm/device-api-server/chart_test.go b/deployments/helm/device-api-server/chart_test.go
new file mode 100644
index 000000000..cc5a42864
--- /dev/null
+++ b/deployments/helm/device-api-server/chart_test.go
@@ -0,0 +1,181 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package chart_test
+
+import (
+ "os"
+ "os/exec"
+ "strings"
+ "testing"
+)
+
+// chartDir returns the path to the Helm chart directory.
+func chartDir(t *testing.T) string {
+ t.Helper()
+ // When running from the chart directory itself
+ if _, err := os.Stat("Chart.yaml"); err == nil {
+ wd, _ := os.Getwd()
+ return wd
+ }
+ t.Fatal("Chart.yaml not found; run tests from the chart directory")
+ return ""
+}
+
+// helmTemplate runs helm template with optional --set overrides and returns stdout.
+func helmTemplate(t *testing.T, sets ...string) string {
+ t.Helper()
+ args := []string{"template", "test-release", chartDir(t)}
+ for _, s := range sets {
+ args = append(args, "--set", s)
+ }
+ cmd := exec.Command("helm", args...)
+ out, err := cmd.CombinedOutput()
+ if err != nil {
+ t.Fatalf("helm template failed: %v\n%s", err, string(out))
+ }
+ return string(out)
+}
+
+// TestChart_DefaultRenders checks that a default-values render succeeds and
+// emits the core workload objects.
+func TestChart_DefaultRenders(t *testing.T) {
+	out := helmTemplate(t)
+	if len(out) == 0 {
+		t.Fatal("helm template produced no output")
+	}
+	// Should contain a DaemonSet
+	if !strings.Contains(out, "kind: DaemonSet") {
+		t.Error("Expected DaemonSet in rendered output")
+	}
+	// Should contain a ServiceAccount
+	if !strings.Contains(out, "kind: ServiceAccount") {
+		t.Error("Expected ServiceAccount in rendered output")
+	}
+}
+
+// TestChart_TerminationGracePeriod_Default pins the chart's derived grace
+// period: shutdownDelay + shutdownGracePeriod + a 5s buffer.
+func TestChart_TerminationGracePeriod_Default(t *testing.T) {
+	out := helmTemplate(t)
+	// Default: shutdownDelay(5) + shutdownGracePeriod(25) + 5 = 35
+	if !strings.Contains(out, "terminationGracePeriodSeconds: 35") {
+		t.Errorf("Expected terminationGracePeriodSeconds: 35 with defaults, got:\n%s",
+			extractLine(out, "terminationGracePeriodSeconds"))
+	}
+}
+
+// TestChart_TerminationGracePeriod_CustomValues verifies the same formula
+// holds for overridden shutdown values.
+func TestChart_TerminationGracePeriod_CustomValues(t *testing.T) {
+	out := helmTemplate(t,
+		"server.shutdownDelay=10",
+		"server.shutdownGracePeriod=60",
+	)
+	// 10 + 60 + 5 = 75
+	if !strings.Contains(out, "terminationGracePeriodSeconds: 75") {
+		t.Errorf("Expected terminationGracePeriodSeconds: 75 with custom values, got:\n%s",
+			extractLine(out, "terminationGracePeriodSeconds"))
+	}
+}
+
+// TestChart_NoNVMLSidecar_ByDefault: the sidecar is opt-in.
+func TestChart_NoNVMLSidecar_ByDefault(t *testing.T) {
+	out := helmTemplate(t)
+	if strings.Contains(out, "name: nvml-provider") {
+		t.Error("NVML provider sidecar should not be present by default")
+	}
+}
+
+// TestChart_NVMLSidecar_WhenEnabled: enabling nvmlProvider adds the sidecar
+// container and its GPU-visibility env var.
+func TestChart_NVMLSidecar_WhenEnabled(t *testing.T) {
+	out := helmTemplate(t, "nvmlProvider.enabled=true")
+	if !strings.Contains(out, "name: nvml-provider") {
+		t.Error("NVML provider sidecar should be present when enabled")
+	}
+	// Should have NVIDIA_VISIBLE_DEVICES env var
+	if !strings.Contains(out, "NVIDIA_VISIBLE_DEVICES") {
+		t.Error("Expected NVIDIA_VISIBLE_DEVICES env var in nvml-provider sidecar")
+	}
+}
+
+// TestChart_BindAddress pins the default unix-socket bind flag.
+func TestChart_BindAddress(t *testing.T) {
+	out := helmTemplate(t)
+	// Default binds to unix socket
+	if !strings.Contains(out, "--bind-address=unix:///var/run/device-api/device.sock") {
+		t.Error("Expected default --bind-address=unix:///var/run/device-api/device.sock")
+	}
+}
+
+// TestChart_SecurityContext checks the hardened default security context.
+func TestChart_SecurityContext(t *testing.T) {
+	out := helmTemplate(t)
+	if !strings.Contains(out, "readOnlyRootFilesystem: true") {
+		t.Error("Expected readOnlyRootFilesystem: true in security context")
+	}
+	if !strings.Contains(out, "runAsNonRoot: true") {
+		t.Error("Expected runAsNonRoot: true in security context")
+	}
+	if !strings.Contains(out, "allowPrivilegeEscalation: false") {
+		t.Error("Expected allowPrivilegeEscalation: false in security context")
+	}
+}
+
+// TestChart_SocketVolume checks the shared socket volume and its host path.
+func TestChart_SocketVolume(t *testing.T) {
+	out := helmTemplate(t)
+	if !strings.Contains(out, "name: socket-dir") {
+		t.Error("Expected socket-dir volume")
+	}
+	if !strings.Contains(out, "/var/run/device-api") {
+		t.Error("Expected socket directory path /var/run/device-api")
+	}
+}
+
+func TestChart_MetricsPort_WhenEnabled(t *testing.T) {
+	out := helmTemplate(t, "metrics.enabled=true")
+	if !strings.Contains(out, "name: metrics") {
+		t.Error("Expected metrics port when metrics are enabled")
+	}
+}
+
+// TestChart_MetricsPort_WhenDisabled scans line pairs for a containerPort
+// immediately followed by "name: metrics".
+// NOTE(review): this heuristic only catches the containerPort-then-name field
+// order; confirm the template emits fields in that order, otherwise the test
+// can silently pass.
+func TestChart_MetricsPort_WhenDisabled(t *testing.T) {
+	out := helmTemplate(t, "metrics.enabled=false")
+	// The metrics port should not appear in containerPort definitions
+	lines := strings.Split(out, "\n")
+	for i, line := range lines {
+		if strings.Contains(line, "name: metrics") &&
+			i > 0 && strings.Contains(lines[i-1], "containerPort") {
+			t.Error("Metrics port should not be present when metrics are disabled")
+		}
+	}
+}
+
+func TestChart_NodeSelector(t *testing.T) {
+	out := helmTemplate(t)
+	if !strings.Contains(out, "nvidia.com/gpu.present") {
+		t.Error("Expected GPU node selector by default")
+	}
+}
+
+// TestChart_PreStopHook looks for the preStop sleep matching shutdownDelay.
+// NOTE(review): the fallback accepts any occurrence of "sleep" in the render,
+// which is a weak check — tighten once the template's exact format is fixed.
+func TestChart_PreStopHook(t *testing.T) {
+	out := helmTemplate(t)
+	// preStop sleep should match shutdownDelay
+	if !strings.Contains(out, `command: ["sleep", "5"]`) {
+		// Try alternate format
+		if !strings.Contains(out, "sleep") {
+			t.Error("Expected preStop sleep hook")
+		}
+	}
+}
+
+// extractLine returns the first line containing the given substring.
+func extractLine(s, substr string) string {
+ for _, line := range strings.Split(s, "\n") {
+ if strings.Contains(line, substr) {
+ return strings.TrimSpace(line)
+ }
+ }
+ return ""
+}
diff --git a/deployments/helm/device-api-server/templates/NOTES.txt b/deployments/helm/device-api-server/templates/NOTES.txt
new file mode 100644
index 000000000..bf22b58ef
--- /dev/null
+++ b/deployments/helm/device-api-server/templates/NOTES.txt
@@ -0,0 +1,126 @@
+{{/*
+Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+===============================================================================
+ NVIDIA Device API Server has been installed!
+===============================================================================
+
+Release: {{ .Release.Name }}
+Namespace: {{ .Release.Namespace }}
+Chart Version: {{ .Chart.Version }}
+App Version: {{ .Chart.AppVersion }}
+
+-------------------------------------------------------------------------------
+ Configuration Summary
+-------------------------------------------------------------------------------
+
+Unix Socket: {{ .Values.server.unixSocket }}
+Health Port: {{ .Values.server.healthPort }}
+Metrics Port: {{ .Values.server.metricsPort }}
+{{- if .Values.nvmlProvider.enabled }}
+NVML Provider Sidecar: Enabled
+ - Connects via: unix://{{ .Values.server.unixSocket }}
+ - Driver Root: {{ .Values.nvmlProvider.driverRoot }}
+ - Health Check: {{ .Values.nvmlProvider.healthCheckEnabled }}
+{{- else }}
+NVML Provider Sidecar: Disabled
+{{- end }}
+
+-------------------------------------------------------------------------------
+ Verify Installation
+-------------------------------------------------------------------------------
+
+1. Check that DaemonSet pods are running on GPU nodes:
+
+ kubectl get pods -n {{ .Release.Namespace }} -l app.kubernetes.io/instance={{ .Release.Name }} -o wide
+
+2. Check pod logs:
+
+ kubectl logs -n {{ .Release.Namespace }} -l app.kubernetes.io/instance={{ .Release.Name }} -f
+
+3. Verify metrics endpoint (from within the cluster):
+
+ kubectl run -n {{ .Release.Namespace }} --rm -it --restart=Never --image=curlimages/curl:latest curl -- \
+ curl -s http://{{ include "device-api-server.fullname" . }}-metrics.{{ .Release.Namespace }}.svc:{{ .Values.server.metricsPort }}/metrics | head -20
+
+{{- if .Values.metrics.enabled }}
+
+-------------------------------------------------------------------------------
+ Metrics & Monitoring
+-------------------------------------------------------------------------------
+
+Metrics endpoint: http://:{{ .Values.server.metricsPort }}/metrics
+
+{{- if .Values.metrics.serviceMonitor.enabled }}
+ServiceMonitor: Enabled (Prometheus will auto-discover)
+{{- else }}
+ServiceMonitor: Disabled
+ To enable Prometheus auto-discovery, upgrade with:
+ --set metrics.serviceMonitor.enabled=true
+{{- end }}
+
+{{- if .Values.metrics.prometheusRule.enabled }}
+PrometheusRule: Enabled (alerts configured)
+{{- else }}
+PrometheusRule: Disabled
+ To enable alerting rules, upgrade with:
+ --set metrics.prometheusRule.enabled=true
+{{- end }}
+{{- end }}
+
+-------------------------------------------------------------------------------
+ Client Connection
+-------------------------------------------------------------------------------
+
+Providers and consumers on the same node can connect via:
+
+ - Unix Socket: unix://{{ .Values.server.unixSocket }}
+
+Example using grpcurl:
+
+ # List available services (via health/admin port)
+ grpcurl -plaintext localhost:{{ .Values.server.healthPort }} list
+
+ # List GPUs (via unix socket, requires grpcurl with unix support)
+ grpcurl -plaintext -unix {{ .Values.server.unixSocket }} \
+ nvidia.device.v1alpha1.GpuService/ListGpus
+
+{{- if .Values.nvmlProvider.enabled }}
+
+-------------------------------------------------------------------------------
+ NVML Provider Sidecar Notes
+-------------------------------------------------------------------------------
+
+The NVML provider sidecar requires:
+ 1. RuntimeClass "nvidia" must exist in the cluster
+ 2. NVIDIA GPU Operator or Container Toolkit installed
+ 3. Nodes must have NVIDIA GPUs
+
+Verify RuntimeClass exists:
+ kubectl get runtimeclass nvidia
+
+If not present, create it or install the NVIDIA GPU Operator:
+ https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/
+
+{{- end }}
+
+-------------------------------------------------------------------------------
+ Support
+-------------------------------------------------------------------------------
+
+Documentation: https://github.com/nvidia/nvsentinel
+Issues: https://github.com/nvidia/nvsentinel/issues
+
+===============================================================================
diff --git a/deployments/helm/device-api-server/templates/_helpers.tpl b/deployments/helm/device-api-server/templates/_helpers.tpl
new file mode 100644
index 000000000..8771b2ec9
--- /dev/null
+++ b/deployments/helm/device-api-server/templates/_helpers.tpl
@@ -0,0 +1,95 @@
+{{/*
+Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "device-api-server.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "device-api-server.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $name := default .Chart.Name .Values.nameOverride }}
+{{- if contains $name .Release.Name }}
+{{- .Release.Name | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "device-api-server.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels
+*/}}
+{{- define "device-api-server.labels" -}}
+helm.sh/chart: {{ include "device-api-server.chart" . }}
+{{ include "device-api-server.selectorLabels" . }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+app.kubernetes.io/part-of: device-api
+{{- end }}
+
+{{/*
+Selector labels
+*/}}
+{{- define "device-api-server.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "device-api-server.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+app.kubernetes.io/component: device-api-server
+{{- end }}
+
+{{/*
+Create the name of the service account to use
+*/}}
+{{- define "device-api-server.serviceAccountName" -}}
+{{- if .Values.serviceAccount.create }}
+{{- default (include "device-api-server.fullname" .) .Values.serviceAccount.name }}
+{{- else }}
+{{- default "default" .Values.serviceAccount.name }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create the image name
+*/}}
+{{- define "device-api-server.image" -}}
+{{- $tag := default .Chart.AppVersion .Values.image.tag -}}
+{{- printf "%s:%s" .Values.image.repository $tag }}
+{{- end }}
+
+{{/*
+Socket directory path
+*/}}
+{{- define "device-api-server.socketDir" -}}
+{{- .Values.server.unixSocket | dir }}
+{{- end }}
diff --git a/deployments/helm/device-api-server/templates/daemonset.yaml b/deployments/helm/device-api-server/templates/daemonset.yaml
new file mode 100644
index 000000000..7143ddb5f
--- /dev/null
+++ b/deployments/helm/device-api-server/templates/daemonset.yaml
@@ -0,0 +1,222 @@
+{{/*
+Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+{{- if not (or (hasPrefix "/var/run/" .Values.server.unixSocket) (hasPrefix "/tmp/" .Values.server.unixSocket)) }}
+{{- fail "server.unixSocket must be an absolute path under /var/run/ or /tmp/" }}
+{{- end }}
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+ name: {{ include "device-api-server.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ {{- include "device-api-server.labels" . | nindent 4 }}
+spec:
+ selector:
+ matchLabels:
+ {{- include "device-api-server.selectorLabels" . | nindent 6 }}
+ updateStrategy:
+ {{- toYaml .Values.updateStrategy | nindent 4 }}
+ template:
+ metadata:
+ annotations:
+ {{- with .Values.podAnnotations }}
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ labels:
+ {{- include "device-api-server.labels" . | nindent 8 }}
+ {{- with .Values.podLabels }}
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ spec:
+ {{- with .Values.imagePullSecrets }}
+ imagePullSecrets:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ serviceAccountName: {{ include "device-api-server.serviceAccountName" . }}
+ automountServiceAccountToken: {{ .Values.serviceAccount.automountServiceAccountToken }}
+ {{- with .Values.podSecurityContext }}
+ securityContext:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- with .Values.priorityClassName }}
+ priorityClassName: {{ . }}
+ {{- end }}
+ {{- with .Values.runtimeClassName }}
+ runtimeClassName: {{ . }}
+ {{- end }}
+ initContainers:
+        # Set restrictive permissions on the socket directory
+        - name: init-socket-dir
+          image: {{ include "device-api-server.image" . }}
+          command: ["sh", "-c", "mkdir -p {{ include "device-api-server.socketDir" . }} && chmod 0750 {{ include "device-api-server.socketDir" . }}"]
+          securityContext:
+            # Root is required to chmod the hostPath dir; the inherited pod-level
+            runAsNonRoot: false
+            runAsUser: 0
+            allowPrivilegeEscalation: false
+            capabilities: {drop: [ALL]}
+          volumeMounts:
+            - name: socket-dir
+              mountPath: {{ include "device-api-server.socketDir" . }}
+ {{- with .Values.initContainers }}
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ containers:
+ - name: {{ .Chart.Name }}
+ image: {{ include "device-api-server.image" . }}
+ imagePullPolicy: {{ .Values.image.pullPolicy }}
+ args:
+ - --bind-address=unix://{{ .Values.server.unixSocket }}
+ - --health-probe-bind-address=:{{ .Values.server.healthPort }}
+ - --metrics-bind-address=:{{ .Values.server.metricsPort }}
+ - --shutdown-grace-period={{ .Values.server.shutdownGracePeriod }}s
+ - -v={{ .Values.logging.verbosity }}
+ env:
+ - name: NODE_NAME
+ valueFrom:
+ fieldRef:
+ fieldPath: spec.nodeName
+ - name: POD_NAME
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.name
+ - name: POD_NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.namespace
+ {{- with .Values.env }}
+ {{- toYaml . | nindent 12 }}
+ {{- end }}
+ lifecycle:
+ preStop:
+ exec:
+ # Sleep to allow k8s to propagate endpoint removal
+ command: ["sleep", "{{ .Values.server.shutdownDelay }}"]
+ ports:
+ - name: health
+ containerPort: {{ .Values.server.healthPort }}
+ protocol: TCP
+ {{- if .Values.metrics.enabled }}
+ - name: metrics
+ containerPort: {{ .Values.server.metricsPort }}
+ protocol: TCP
+ {{- end }}
+ # Health probes use the TCP admin port (gRPC health service).
+ # The server's health monitor checks both storage readiness and
+ # service readiness before reporting SERVING, so a passing probe
+ # implies the device socket is functional. K8s does not support
+ # Unix domain socket probes natively.
+ {{- with .Values.livenessProbe }}
+ livenessProbe:
+ {{- toYaml . | nindent 12 }}
+ {{- end }}
+ {{- with .Values.readinessProbe }}
+ readinessProbe:
+ {{- toYaml . | nindent 12 }}
+ {{- end }}
+ {{- with .Values.securityContext }}
+ securityContext:
+ {{- toYaml . | nindent 12 }}
+ {{- end }}
+ {{- with .Values.resources }}
+ resources:
+ {{- toYaml . | nindent 12 }}
+ {{- end }}
+ volumeMounts:
+ - name: socket-dir
+ mountPath: {{ include "device-api-server.socketDir" . }}
+ {{- with .Values.extraVolumeMounts }}
+ {{- toYaml . | nindent 12 }}
+ {{- end }}
+ {{- if .Values.nvmlProvider.enabled }}
+ # NVML Provider sidecar container
+ - name: nvml-provider
+ image: "{{ .Values.nvmlProvider.image.repository }}:{{ .Values.nvmlProvider.image.tag | default .Chart.AppVersion }}"
+ imagePullPolicy: {{ .Values.nvmlProvider.image.pullPolicy }}
+ args:
+ - --server-address=unix://{{ .Values.server.unixSocket }}
+ - --provider-id={{ .Values.nvmlProvider.providerID }}
+ - --driver-root={{ .Values.nvmlProvider.driverRoot }}
+ - --health-check={{ .Values.nvmlProvider.healthCheckEnabled }}
+ - --health-port={{ .Values.nvmlProvider.healthPort }}
+ env:
+ - name: NODE_NAME
+ valueFrom:
+ fieldRef:
+ fieldPath: spec.nodeName
+ # NVIDIA Container Toolkit environment variables
+ - name: NVIDIA_VISIBLE_DEVICES
+ value: "all"
+ - name: NVIDIA_DRIVER_CAPABILITIES
+ value: "utility"
+ ports:
+ - name: provider-health
+ containerPort: {{ .Values.nvmlProvider.healthPort }}
+ protocol: TCP
+ livenessProbe:
+ httpGet:
+ path: /healthz
+ port: provider-health
+ initialDelaySeconds: 10
+ periodSeconds: 10
+ timeoutSeconds: 5
+ failureThreshold: 3
+ readinessProbe:
+ httpGet:
+ path: /readyz
+ port: provider-health
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ timeoutSeconds: 5
+ failureThreshold: 3
+ {{- with .Values.nvmlProvider.securityContext }}
+ securityContext:
+ {{- toYaml . | nindent 12 }}
+ {{- end }}
+ {{- with .Values.nvmlProvider.resources }}
+ resources:
+ {{- toYaml . | nindent 12 }}
+ {{- end }}
+ volumeMounts:
+ - name: socket-dir
+ mountPath: {{ include "device-api-server.socketDir" . }}
+ {{- end }}
+ {{- with .Values.sidecars }}
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ volumes:
+ - name: socket-dir
+ hostPath:
+ path: {{ include "device-api-server.socketDir" . }}
+ type: DirectoryOrCreate
+ {{- with .Values.extraVolumes }}
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- with .Values.nodeSelector }}
+ nodeSelector:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- with .Values.affinity }}
+ affinity:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- with .Values.tolerations }}
+ tolerations:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+      # terminationGracePeriodSeconds = preStop sleep + shutdown grace period + buffer.
+      # K8s sends SIGTERM only after the preStop hook completes, so the delays add sequentially.
+ terminationGracePeriodSeconds: {{ add .Values.server.shutdownDelay .Values.server.shutdownGracePeriod 5 }}
diff --git a/deployments/helm/device-api-server/templates/prometheusrule.yaml b/deployments/helm/device-api-server/templates/prometheusrule.yaml
new file mode 100644
index 000000000..3a82faca6
--- /dev/null
+++ b/deployments/helm/device-api-server/templates/prometheusrule.yaml
@@ -0,0 +1,93 @@
+{{/*
+Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+{{- if and .Values.metrics.enabled .Values.metrics.prometheusRule.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ include "device-api-server.fullname" . }}
+ namespace: {{ .Values.metrics.prometheusRule.namespace | default .Release.Namespace }}
+ labels:
+ {{- include "device-api-server.labels" . | nindent 4 }}
+ {{- with .Values.metrics.prometheusRule.labels }}
+ {{- toYaml . | nindent 4 }}
+ {{- end }}
+spec:
+ groups:
+ - name: device-api-server
+ rules:
+ # Server availability
+ - alert: DeviceAPIServerDown
+ expr: up{job="{{ include "device-api-server.fullname" . }}-metrics"} == 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Device API Server is down on {{ "{{ $labels.instance }}" }}"
+ description: "Device API Server has been unreachable for more than 5 minutes."
+ runbook_url: "https://github.com/nvidia/device-api/blob/main/docs/operations/device-api-server.md#alert-deviceapiserverdown"
+
+ # High latency
+ - alert: DeviceAPIServerHighLatency
+ expr: |
+ histogram_quantile(0.99,
+ sum(rate(grpc_server_handling_seconds_bucket{
+ grpc_service="nvidia.device.v1alpha1.GpuService"
+ }[5m])) by (le, instance)
+ ) > 0.5
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Device API Server high latency on {{ "{{ $labels.instance }}" }}"
+ description: "P99 latency is above 500ms for more than 5 minutes."
+ runbook_url: "https://github.com/nvidia/device-api/blob/main/docs/operations/device-api-server.md#alert-deviceapiserverhighlatency"
+
+ # High error rate
+ - alert: DeviceAPIServerHighErrorRate
+ expr: |
+ sum(rate(grpc_server_handled_total{
+ grpc_code!="OK",
+ grpc_service=~"nvidia.device.v1alpha1.*"
+ }[5m])) by (instance)
+ /
+ sum(rate(grpc_server_handled_total{
+ grpc_service=~"nvidia.device.v1alpha1.*"
+ }[5m])) by (instance)
+ > 0.1
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Device API Server high error rate on {{ "{{ $labels.instance }}" }}"
+ description: "Error rate is above 10% for more than 5 minutes."
+ runbook_url: "https://github.com/nvidia/device-api/blob/main/docs/operations/device-api-server.md#alert-deviceapiserverhigherrorrate"
+
+ # High memory usage
+ - alert: DeviceAPIServerHighMemory
+ expr: |
+ process_resident_memory_bytes{job="{{ include "device-api-server.fullname" . }}-metrics"} > 512 * 1024 * 1024
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Device API Server high memory usage on {{ "{{ $labels.instance }}" }}"
+ description: "Memory usage is above 512MB for more than 10 minutes."
+ runbook_url: "https://github.com/nvidia/device-api/blob/main/docs/operations/device-api-server.md#alert-deviceapiserverhighmemory"
+
+ {{- with .Values.metrics.prometheusRule.additionalRules }}
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+{{- end }}
diff --git a/deployments/helm/device-api-server/templates/service.yaml b/deployments/helm/device-api-server/templates/service.yaml
new file mode 100644
index 000000000..64ee33c40
--- /dev/null
+++ b/deployments/helm/device-api-server/templates/service.yaml
@@ -0,0 +1,37 @@
+{{/*
+Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+{{- if .Values.metrics.enabled }}
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ include "device-api-server.fullname" . }}-metrics
+ namespace: {{ .Release.Namespace }}
+ labels:
+ {{- include "device-api-server.labels" . | nindent 4 }}
+ {{- with .Values.service.annotations }}
+ annotations:
+ {{- toYaml . | nindent 4 }}
+ {{- end }}
+spec:
+ type: {{ .Values.service.type }}
+ ports:
+ - port: {{ .Values.server.metricsPort }}
+ targetPort: metrics
+ protocol: TCP
+ name: metrics
+ selector:
+ {{- include "device-api-server.selectorLabels" . | nindent 4 }}
+{{- end }}
diff --git a/deployments/helm/device-api-server/templates/serviceaccount.yaml b/deployments/helm/device-api-server/templates/serviceaccount.yaml
new file mode 100644
index 000000000..e4c0a6091
--- /dev/null
+++ b/deployments/helm/device-api-server/templates/serviceaccount.yaml
@@ -0,0 +1,29 @@
+{{/*
+Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+{{- if .Values.serviceAccount.create }}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: {{ include "device-api-server.serviceAccountName" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ {{- include "device-api-server.labels" . | nindent 4 }}
+ {{- with .Values.serviceAccount.annotations }}
+ annotations:
+ {{- toYaml . | nindent 4 }}
+ {{- end }}
+automountServiceAccountToken: {{ .Values.serviceAccount.automountServiceAccountToken }}
+{{- end }}
diff --git a/deployments/helm/device-api-server/templates/servicemonitor.yaml b/deployments/helm/device-api-server/templates/servicemonitor.yaml
new file mode 100644
index 000000000..cb378ae22
--- /dev/null
+++ b/deployments/helm/device-api-server/templates/servicemonitor.yaml
@@ -0,0 +1,47 @@
+{{/*
+Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+{{- if and .Values.metrics.enabled .Values.metrics.serviceMonitor.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+ name: {{ include "device-api-server.fullname" . }}
+ namespace: {{ .Values.metrics.serviceMonitor.namespace | default .Release.Namespace }}
+ labels:
+ {{- include "device-api-server.labels" . | nindent 4 }}
+ {{- with .Values.metrics.serviceMonitor.labels }}
+ {{- toYaml . | nindent 4 }}
+ {{- end }}
+spec:
+ selector:
+ matchLabels:
+ {{- include "device-api-server.selectorLabels" . | nindent 6 }}
+ namespaceSelector:
+ matchNames:
+ - {{ .Release.Namespace }}
+ endpoints:
+ - port: metrics
+ interval: {{ .Values.metrics.serviceMonitor.interval }}
+ scrapeTimeout: {{ .Values.metrics.serviceMonitor.scrapeTimeout }}
+ path: /metrics
+ {{- with .Values.metrics.serviceMonitor.metricRelabelings }}
+ metricRelabelings:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- with .Values.metrics.serviceMonitor.relabelings }}
+ relabelings:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+{{- end }}
diff --git a/deployments/helm/device-api-server/values.yaml b/deployments/helm/device-api-server/values.yaml
new file mode 100644
index 000000000..9c9dbb907
--- /dev/null
+++ b/deployments/helm/device-api-server/values.yaml
@@ -0,0 +1,255 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Default values for device-api-server.
+# This is a YAML-formatted file.
+
+# -- Number of replicas (ignored for DaemonSet, kept for consistency)
+replicaCount: 1
+
+# -- Image configuration
+image:
+ # -- Image repository
+ repository: ghcr.io/nvidia/device-api-server
+ # -- Image pull policy
+ pullPolicy: IfNotPresent
+ # -- Image tag (defaults to Chart appVersion)
+ tag: ""
+
+# -- Image pull secrets
+imagePullSecrets: []
+
+# -- Override the name of the chart
+nameOverride: ""
+
+# -- Override the full name of the chart
+fullnameOverride: ""
+
+# -- Server configuration
+server:
+ # -- Unix socket path for gRPC API (device service)
+ # Must be an absolute path under /var/run/ or /tmp/.
+ # WARNING: Arbitrary paths may expose host filesystem risks.
+ unixSocket: /var/run/device-api/device.sock
+ # -- HTTP port for health/admin gRPC endpoints
+ healthPort: 8081
+ # -- HTTP port for Prometheus metrics
+ metricsPort: 9090
+ # -- Graceful shutdown grace period in seconds
+ shutdownGracePeriod: 25
+ # -- Shutdown delay in seconds (preStop sleep for k8s endpoint propagation)
+ shutdownDelay: 5
+
+# -- Logging configuration
+logging:
+ # -- Log verbosity level (0=info, higher=more verbose)
+ verbosity: 0
+
+# -- NVML Provider Sidecar configuration
+# Deploys the NVML provider as a sidecar container that connects to device-api-server
+# via gRPC. This provides better isolation and independent updates compared to the
+# built-in nvml provider.
+nvmlProvider:
+ # -- Enable the NVML provider sidecar container
+ enabled: false
+ # -- Image configuration for the nvml-provider sidecar
+ image:
+ # -- Image repository
+ repository: ghcr.io/nvidia/device-api-server
+ # -- Image tag (defaults to Chart appVersion)
+ tag: ""
+ # -- Image pull policy
+ pullPolicy: IfNotPresent
+  # NOTE: the sidecar's gRPC server address is not configurable here; the
+  # daemonset template derives it from server.unixSocket, and the sidecar
+  # connects over the shared unix socket volume.
+ # -- Unique identifier for this provider instance
+ providerID: "nvml-provider-sidecar"
+ # -- Root path where NVIDIA driver libraries are located
+ driverRoot: /run/nvidia/driver
+ # -- Enable XID event monitoring for health checks
+ healthCheckEnabled: true
+ # -- HTTP port for health check endpoints
+ healthPort: 8082
+ # -- Resource limits and requests for the sidecar
+ resources:
+ requests:
+ cpu: 50m
+ memory: 64Mi
+ limits:
+ cpu: 200m
+ memory: 128Mi
+ # -- Security context for the sidecar container
+ securityContext:
+ runAsNonRoot: true
+ runAsUser: 65534
+ runAsGroup: 65534
+ readOnlyRootFilesystem: true
+ allowPrivilegeEscalation: false
+ capabilities:
+ drop:
+ - ALL
+
+# -- RuntimeClassName for the pod
+# Set to "nvidia" when nvmlProvider.enabled is true to inject NVIDIA driver libraries
+# Requires the NVIDIA GPU Operator or manual RuntimeClass configuration
+runtimeClassName: ""
+
+# -- ServiceAccount configuration
+serviceAccount:
+ # -- Create a ServiceAccount
+ create: true
+ # -- ServiceAccount name (generated if not set)
+ name: ""
+ # -- Annotations to add to the ServiceAccount
+ annotations: {}
+ # -- Automount service account token
+ automountServiceAccountToken: false
+
+# -- RBAC configuration
+rbac:
+ # -- Create RBAC resources
+ create: true
+
+# -- Pod annotations
+podAnnotations: {}
+
+# -- Pod labels
+podLabels: {}
+
+# -- Pod security context
+podSecurityContext:
+ runAsNonRoot: true
+ seccompProfile:
+ type: RuntimeDefault
+
+# -- Container security context
+securityContext:
+ runAsNonRoot: true
+ runAsUser: 65534
+ runAsGroup: 65534
+ readOnlyRootFilesystem: true
+ allowPrivilegeEscalation: false
+ capabilities:
+ drop:
+ - ALL
+
+# -- Resource limits and requests
+# Default limits handle the common 8-GPU case. For larger nodes, increase:
+# - 8 GPUs: 500m CPU, 512Mi memory (default)
+# - 16 GPUs: 1000m CPU, 1Gi memory
+# Memory usage scales with: GPU count * watch event size * watcher count
+resources:
+ requests:
+ cpu: 100m
+ memory: 128Mi
+ limits:
+ cpu: 500m
+ memory: 512Mi
+
+# -- Node selector for scheduling
+# @default -- Schedules only on GPU nodes
+nodeSelector:
+ nvidia.com/gpu.present: "true"
+
+# -- Tolerations for scheduling
+tolerations:
+ - key: nvidia.com/gpu
+ operator: Exists
+ effect: NoSchedule
+
+# -- Affinity rules
+affinity: {}
+
+# -- Priority class name
+priorityClassName: ""
+
+# -- Liveness probe configuration (gRPC health check on admin server)
+livenessProbe:
+ grpc:
+ port: 8081
+ initialDelaySeconds: 5
+ periodSeconds: 10
+ timeoutSeconds: 5
+ failureThreshold: 3
+
+# -- Readiness probe configuration (gRPC health check on admin server)
+readinessProbe:
+ grpc:
+ port: 8081
+ initialDelaySeconds: 5
+ periodSeconds: 10
+ timeoutSeconds: 5
+ failureThreshold: 3
+
+# -- Update strategy for the DaemonSet
+updateStrategy:
+ type: RollingUpdate
+ rollingUpdate:
+ maxUnavailable: 1
+
+# -- Service configuration (for metrics scraping)
+service:
+ # -- Service type
+ type: ClusterIP
+ # -- Service annotations
+ annotations: {}
+
+# -- Prometheus metrics configuration
+metrics:
+ # -- Enable metrics endpoint
+ enabled: true
+ # -- ServiceMonitor configuration (requires Prometheus Operator)
+ serviceMonitor:
+ # -- Create ServiceMonitor resource
+ enabled: false
+ # -- ServiceMonitor namespace (defaults to release namespace)
+ namespace: ""
+ # -- Additional labels for ServiceMonitor
+ labels: {}
+ # -- Scrape interval
+ interval: 30s
+ # -- Scrape timeout
+ scrapeTimeout: 10s
+ # -- Metric relabeling configs
+ metricRelabelings: []
+ # -- Relabeling configs
+ relabelings: []
+ # -- PrometheusRule configuration (requires Prometheus Operator)
+ prometheusRule:
+ # -- Create PrometheusRule resource
+ enabled: false
+ # -- PrometheusRule namespace (defaults to release namespace)
+ namespace: ""
+ # -- Additional labels for PrometheusRule
+ labels: {}
+ # -- Additional alerting rules
+ additionalRules: []
+
+# -- Additional environment variables
+env: []
+# - name: LOG_FORMAT
+# value: json
+
+# -- Additional volume mounts
+extraVolumeMounts: []
+
+# -- Additional volumes
+extraVolumes: []
+
+# -- Init containers
+initContainers: []
+
+# -- Sidecar containers
+sidecars: []
diff --git a/deployments/helm/values-sidecar-test.yaml b/deployments/helm/values-sidecar-test.yaml
new file mode 100644
index 000000000..970b54d78
--- /dev/null
+++ b/deployments/helm/values-sidecar-test.yaml
@@ -0,0 +1,54 @@
+# Sidecar test values - validates nvml-provider sidecar architecture
+# Usage: helm upgrade device-api-server deployments/helm/device-api-server -n device-api -f deployments/helm/values-sidecar-test.yaml
+
+image:
+ repository: ttl.sh/device-api-server
+ tag: "2h"
+ pullPolicy: Always
+
+# Disable built-in NVML provider (use sidecar instead)
+nvml:
+ enabled: false
+
+# Enable NVML Provider sidecar
+nvmlProvider:
+ enabled: true
+ image:
+ repository: ttl.sh/device-api-server-sidecar
+ tag: "2h"
+ pullPolicy: Always
+ providerID: "nvml-provider-sidecar"
+ driverRoot: /run/nvidia/driver
+ healthCheckEnabled: true
+ healthPort: 8082
+ resources:
+ requests:
+ cpu: 50m
+ memory: 64Mi
+ limits:
+ cpu: 200m
+ memory: 128Mi
+
+# Override node selector (cluster uses node-type=gpu instead of nvidia.com/gpu.present)
+# Set to null to remove the default, then add only the one we need
+nodeSelector:
+ nvidia.com/gpu.present: null
+ node-type: gpu
+
+# RuntimeClass for NVML access
+runtimeClassName: nvidia
+
+logging:
+ verbosity: 2
+
+# Run as root to allow hostPath socket creation
+podSecurityContext:
+ runAsNonRoot: false
+ runAsUser: 0
+ runAsGroup: 0
+ fsGroup: 0
+
+securityContext:
+ runAsNonRoot: false
+ runAsUser: 0
+ runAsGroup: 0
diff --git a/deployments/static/nvsentinel-daemonset.yaml b/deployments/static/nvsentinel-daemonset.yaml
new file mode 100644
index 000000000..beb6c8d87
--- /dev/null
+++ b/deployments/static/nvsentinel-daemonset.yaml
@@ -0,0 +1,217 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# WARNING: These static manifests use placeholder image tags (v0.0.0).
+# For production deployments, use the Helm chart with explicit image versions
+# or replace v0.0.0 with a specific release tag (e.g., v1.0.0).
+
+# NVSentinel Static Deployment Manifest
+#
+# This manifest deploys the Device API Server with the NVML Provider sidecar.
+# For production use, consider using the Helm chart for better configurability.
+#
+# Usage:
+# kubectl apply -f nvsentinel-daemonset.yaml
+#
+# Prerequisites:
+# - Kubernetes 1.25+
+# - RuntimeClass 'nvidia' configured (GPU Operator or manual setup)
+# - GPU nodes labeled with 'nvidia.com/gpu.present=true'
+
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: nvsentinel
+ labels:
+ app.kubernetes.io/name: nvsentinel
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: nvsentinel
+ namespace: nvsentinel
+ labels:
+ app.kubernetes.io/name: nvsentinel
+automountServiceAccountToken: false
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: nvsentinel
+ namespace: nvsentinel
+ labels:
+ app.kubernetes.io/name: nvsentinel
+spec:
+ type: ClusterIP
+ clusterIP: None # Headless for DaemonSet
+ selector:
+ app.kubernetes.io/name: nvsentinel
+ ports:
+ - name: health
+ port: 8081
+ targetPort: health
+ protocol: TCP
+ - name: metrics
+ port: 9090
+ targetPort: metrics
+ protocol: TCP
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+ name: nvsentinel
+ namespace: nvsentinel
+ labels:
+ app.kubernetes.io/name: nvsentinel
+ app.kubernetes.io/component: device-api-server
+spec:
+ selector:
+ matchLabels:
+ app.kubernetes.io/name: nvsentinel
+ updateStrategy:
+ type: RollingUpdate
+ rollingUpdate:
+ maxUnavailable: 1
+ template:
+ metadata:
+ labels:
+ app.kubernetes.io/name: nvsentinel
+ app.kubernetes.io/component: device-api-server
+ annotations:
+ prometheus.io/scrape: "true"
+ prometheus.io/port: "9090"
+ prometheus.io/path: "/metrics"
+ spec:
+ serviceAccountName: nvsentinel
+ # runtimeClassName: nvidia enables the NVIDIA Container Runtime,
+ # required for the nvml-provider sidecar to access GPU devices.
+ # This requires RuntimeClass 'nvidia' configured in the cluster
+ # (via NVIDIA GPU Operator or manual setup).
+ # See: https://kubernetes.io/docs/concepts/containers/runtime-class/
+ runtimeClassName: nvidia
+ nodeSelector:
+ nvidia.com/gpu.present: "true"
+ tolerations:
+ - key: nvidia.com/gpu
+ operator: Exists
+ effect: NoSchedule
+ securityContext:
+ runAsNonRoot: true
+ seccompProfile:
+ type: RuntimeDefault
+ containers:
+ # Device API Server - Pure Go, no NVML dependencies
+ - name: device-api-server
+ image: ghcr.io/nvidia/device-api-server:v0.0.0 # Replace with specific version for production
+ imagePullPolicy: IfNotPresent
+ args:
+ - --bind-address=unix:///var/run/device-api/device.sock
+ - --health-probe-bind-address=:8081
+ - --metrics-bind-address=:9090
+ - --shutdown-grace-period=25s
+ - -v=0
+ ports:
+ - name: health
+ containerPort: 8081
+ protocol: TCP
+ - name: metrics
+ containerPort: 9090
+ protocol: TCP
+ livenessProbe:
+ httpGet:
+ path: /healthz
+ port: health
+ initialDelaySeconds: 5
+ periodSeconds: 10
+ timeoutSeconds: 5
+ failureThreshold: 3
+ readinessProbe:
+ httpGet:
+ path: /readyz
+ port: health
+ initialDelaySeconds: 5
+ periodSeconds: 10
+ timeoutSeconds: 5
+ failureThreshold: 3
+ resources:
+ requests:
+ cpu: 50m
+ memory: 64Mi
+ limits:
+ cpu: 200m
+ memory: 256Mi
+ securityContext:
+ runAsNonRoot: true
+ runAsUser: 65534
+ runAsGroup: 65534
+ readOnlyRootFilesystem: true
+ allowPrivilegeEscalation: false
+ capabilities:
+ drop:
+ - ALL
+ volumeMounts:
+ - name: device-api-socket
+ mountPath: /var/run/device-api
+
+ # NVML Provider Sidecar - CGO binary, requires RuntimeClass nvidia
+ - name: nvml-provider
+ image: ghcr.io/nvidia/device-api-server:nvml-provider-v0.0.0 # Replace with specific version for production
+ imagePullPolicy: IfNotPresent
+ args:
+ - --server-address=unix:///var/run/device-api/device.sock
+ - --provider-id=nvml-provider
+ - --driver-root=/run/nvidia/driver
+ - --health-port=8082
+ - --health-check=true
+ - -v=0
+ ports:
+ - name: provider-health
+ containerPort: 8082
+ protocol: TCP
+ livenessProbe:
+ httpGet:
+ path: /healthz
+ port: provider-health
+ initialDelaySeconds: 5
+ periodSeconds: 10
+ timeoutSeconds: 5
+ failureThreshold: 3
+ readinessProbe:
+ httpGet:
+ path: /readyz
+ port: provider-health
+ initialDelaySeconds: 5
+ periodSeconds: 10
+ timeoutSeconds: 5
+ failureThreshold: 3
+ resources:
+ requests:
+ cpu: 50m
+ memory: 64Mi
+ limits:
+ cpu: 200m
+ memory: 128Mi
+ securityContext:
+ runAsNonRoot: true
+ runAsUser: 65534
+ runAsGroup: 65534
+ readOnlyRootFilesystem: true
+ allowPrivilegeEscalation: false
+ capabilities:
+ drop:
+ - ALL
+ volumes:
+ - name: device-api-socket
+ emptyDir: {}
diff --git a/docs/api/device-api-server.md b/docs/api/device-api-server.md
new file mode 100644
index 000000000..22b0c6ee9
--- /dev/null
+++ b/docs/api/device-api-server.md
@@ -0,0 +1,425 @@
+# Device API Server - API Reference
+
+This document provides the complete API reference for the Device API Server gRPC services.
+
+## Overview
+
+The Device API Server exposes a unified `GpuService` that provides both read and write operations following Kubernetes API conventions:
+
+| Operation Type | Methods | Clients |
+|----------------|---------|---------|
+| Read | `GetGpu`, `ListGpus`, `WatchGpus` | Consumers (device plugins, DRA drivers) |
+| Write | `CreateGpu`, `UpdateGpu`, `UpdateGpuStatus`, `DeleteGpu` | Providers (health monitors, NVML) |
+
+**Package**: `nvidia.device.v1alpha1`
+
+**Connection Endpoints**:
+- Unix Socket: `unix:///var/run/device-api/device.sock` (recommended)
+- TCP: `localhost:50051`
+
+## GpuService
+
+The `GpuService` provides a unified API for GPU resource management:
+
+- **Read operations** (`GetGpu`, `ListGpus`, `WatchGpus`) for consumers
+- **Write operations** (`CreateGpu`, `UpdateGpu`, `UpdateGpuStatus`, `DeleteGpu`) for providers
+
+> **Important**: Write operations acquire exclusive locks, blocking all consumer reads until completion. This prevents consumers from reading stale "healthy" states during GPU health transitions.
+
+### Read Operations
+
+### GetGpu
+
+Retrieves a single GPU resource by its unique name.
+
+```protobuf
+rpc GetGpu(GetGpuRequest) returns (GetGpuResponse);
+```
+
+**Request**:
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `name` | string | The unique resource name of the GPU |
+
+**Response**:
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `gpu` | Gpu | The requested GPU resource |
+
+**Errors**:
+- `NOT_FOUND`: GPU with the specified name does not exist
+
+**Example**:
+
+```bash
+grpcurl -plaintext \
+  -d '{"name": "gpu-abc123"}' \
+  localhost:50051 \
+  nvidia.device.v1alpha1.GpuService/GetGpu
+```
+
+### ListGpus
+
+Retrieves a list of all GPU resources.
+
+```protobuf
+rpc ListGpus(ListGpusRequest) returns (ListGpusResponse);
+```
+
+**Request**: Empty (reserved for future filtering/pagination)
+
+**Response**:
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `gpu_list` | GpuList | List of all GPU resources |
+
+**Example**:
+
+```bash
+grpcurl -plaintext localhost:50051 \
+ nvidia.device.v1alpha1.GpuService/ListGpus
+```
+
+**Response Example**:
+
+```json
+{
+ "gpuList": {
+ "items": [
+ {
+ "name": "gpu-abc123",
+ "spec": {
+ "uuid": "GPU-a1b2c3d4-e5f6-a7b8-c9d0-e1f2a3b4c5d6"
+ },
+ "status": {
+ "conditions": [
+ {
+ "type": "Ready",
+ "status": "True",
+ "lastTransitionTime": "2026-01-21T10:00:00Z",
+ "reason": "GPUHealthy",
+ "message": "GPU is healthy and available"
+ }
+ ]
+ },
+ "resourceVersion": "42"
+ }
+ ]
+ }
+}
+```
+
+### WatchGpus
+
+Streams lifecycle events for GPU resources. The stream remains open until the client disconnects or an error occurs.
+
+```protobuf
+rpc WatchGpus(WatchGpusRequest) returns (stream WatchGpusResponse);
+```
+
+**Request**: Empty (reserved for future filtering/resumption)
+
+**Response Stream**:
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `type` | string | Event type: `ADDED`, `MODIFIED`, `DELETED`, `ERROR` |
+| `object` | Gpu | The GPU resource (last known state for DELETED) |
+
+**Event Types**:
+
+| Type | Description |
+|------|-------------|
+| `ADDED` | GPU was registered or first observed |
+| `MODIFIED` | GPU status was updated |
+| `DELETED` | GPU was unregistered |
+| `ERROR` | An error occurred in the watch stream |
+
+**Example**:
+
+```bash
+grpcurl -plaintext localhost:50051 \
+ nvidia.device.v1alpha1.GpuService/WatchGpus
+```
+
+**Behavior**:
+- On connection, receives `ADDED` events for all existing GPUs
+- Subsequent events reflect real-time changes
+- Stream is per-client; multiple clients can watch simultaneously
+
+### Write Operations
+
+#### CreateGpu
+
+Creates a new GPU resource. This is the standard way for providers to register GPUs.
+
+```protobuf
+rpc CreateGpu(CreateGpuRequest) returns (CreateGpuResponse);
+```
+
+**Request**:
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `gpu` | Gpu | The GPU to create (metadata.name and spec.uuid required) |
+
+**Response**:
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `gpu` | Gpu | The created GPU with server-assigned fields |
+| `created` | bool | True if new GPU was created, false if already existed |
+
+**Errors**:
+- `INVALID_ARGUMENT`: Required fields missing
+
+**Behavior**:
+- If GPU already exists, returns existing GPU (idempotent)
+- Triggers `ADDED` event for active watch streams
+
+**Example**:
+
+```bash
+grpcurl -plaintext \
+  -d '{
+    "gpu": {
+      "metadata": {"name": "gpu-abc123"},
+      "spec": {"uuid": "GPU-a1b2c3d4-e5f6-a7b8-c9d0-e1f2a3b4c5d6"}
+    }
+  }' \
+  localhost:50051 \
+  nvidia.device.v1alpha1.GpuService/CreateGpu
+```
+
+#### UpdateGpu
+
+Replaces an entire GPU resource (spec and status).
+
+```protobuf
+rpc UpdateGpu(UpdateGpuRequest) returns (Gpu);
+```
+
+**Request**:
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `gpu` | Gpu | The GPU to update (metadata.name required) |
+
+**Response**: The updated GPU resource.
+
+**Errors**:
+- `NOT_FOUND`: GPU does not exist
+- `ABORTED`: Resource version conflict (optimistic concurrency)
+
+**Behavior**:
+- Uses optimistic concurrency via `resource_version`
+- Triggers `MODIFIED` event for active watch streams
+
+#### UpdateGpuStatus
+
+Updates only the status of an existing GPU (follows Kubernetes subresource pattern).
+
+```protobuf
+rpc UpdateGpuStatus(UpdateGpuStatusRequest) returns (Gpu);
+```
+
+**Request**:
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `name` | string | The GPU name to update |
+| `status` | GpuStatus | New status (completely replaces existing) |
+| `resource_version` | int64 | Optional: expected version for conflict detection |
+
+**Response**: The updated GPU resource.
+
+**Errors**:
+- `NOT_FOUND`: GPU does not exist
+- `ABORTED`: Resource version conflict (optimistic concurrency)
+
+**Locking**: Acquires exclusive write lock, blocking all reads.
+
+**Example** (mark GPU unhealthy due to XID error):
+
+```bash
+grpcurl -plaintext \
+  -d '{
+    "name": "gpu-abc123",
+    "status": {
+      "conditions": [{
+        "type": "Ready",
+        "status": "False",
+        "reason": "XidError",
+        "message": "Critical XID error 79 detected"
+      }]
+    }
+  }' \
+  localhost:50051 \
+  nvidia.device.v1alpha1.GpuService/UpdateGpuStatus
+```
+
+#### DeleteGpu
+
+Removes a GPU from the server.
+
+```protobuf
+rpc DeleteGpu(DeleteGpuRequest) returns (google.protobuf.Empty);
+```
+
+**Request**:
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `name` | string | Unique identifier of GPU to remove |
+
+**Response**: Empty on success.
+
+**Errors**:
+- `NOT_FOUND`: GPU does not exist
+
+**Behavior**:
+- GPU will no longer appear in ListGpus/GetGpu responses
+- Triggers `DELETED` event for active watch streams
+
+**Example**:
+
+```bash
+grpcurl -plaintext \
+  -d '{"name": "gpu-abc123"}' \
+  localhost:50051 \
+  nvidia.device.v1alpha1.GpuService/DeleteGpu
+```
+
+---
+
+## Resource Types
+
+### Gpu
+
+The main GPU resource following the Kubernetes Resource Model pattern.
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `name` | string | Unique logical identifier |
+| `spec` | GpuSpec | Identity and desired attributes |
+| `status` | GpuStatus | Most recently observed state |
+| `resource_version` | int64 | Monotonically increasing version |
+
+### GpuSpec
+
+Defines the identity of a GPU.
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `uuid` | string | Physical hardware UUID (e.g., `GPU-a1b2c3d4-...`) |
+
+### GpuStatus
+
+Contains the observed state of a GPU.
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `conditions` | Condition[] | Current state observations |
+| `recommended_action` | string | Suggested resolution for negative states |
+
+### Condition
+
+Describes one aspect of the GPU's current state.
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `type` | string | Category (e.g., `Ready`, `MemoryHealthy`) |
+| `status` | string | `True`, `False`, or `Unknown` |
+| `last_transition_time` | Timestamp | When status last changed |
+| `reason` | string | Machine-readable reason (UpperCamelCase) |
+| `message` | string | Human-readable details |
+
+**Standard Condition Types**:
+
+| Type | Description |
+|------|-------------|
+| `Ready` | Overall GPU health and availability |
+| `MemoryHealthy` | GPU memory is functioning correctly |
+| `ThermalHealthy` | GPU temperature is within safe limits |
+
+---
+
+## Go Client Example
+
+```go
+package main
+
+import (
+ "context"
+ "log"
+
+ v1alpha1 "github.com/nvidia/nvsentinel/api/gen/go/device/v1alpha1"
+ "google.golang.org/grpc"
+ "google.golang.org/grpc/credentials/insecure"
+)
+
+func main() {
+ // Connect via Unix socket (recommended)
+ conn, err := grpc.NewClient(
+ "unix:///var/run/device-api/device.sock",
+ grpc.WithTransportCredentials(insecure.NewCredentials()),
+ )
+ if err != nil {
+ log.Fatalf("failed to connect: %v", err)
+ }
+ defer conn.Close()
+
+ client := v1alpha1.NewGpuServiceClient(conn)
+
+ // Consumer: List GPUs
+ resp, err := client.ListGpus(context.Background(), &v1alpha1.ListGpusRequest{})
+ if err != nil {
+ log.Fatalf("failed to list GPUs: %v", err)
+ }
+
+ for _, gpu := range resp.GpuList.Items {
+		log.Printf("GPU: %s, Version: %d", gpu.Name, gpu.ResourceVersion)
+ for _, cond := range gpu.Status.Conditions {
+ log.Printf(" Condition: %s=%s (%s)", cond.Type, cond.Status, cond.Reason)
+ }
+ }
+
+	// Provider: Update GPU status.
+	// UpdateGpuStatusRequest takes the GPU name and the new status directly
+	// (see the UpdateGpuStatus request fields above).
+	_, err = client.UpdateGpuStatus(context.Background(),
+		&v1alpha1.UpdateGpuStatusRequest{
+			Name: "gpu-abc123",
+			Status: &v1alpha1.GpuStatus{
+				Conditions: []*v1alpha1.Condition{{
+					Type:    "Ready",
+					Status:  "False",
+					Reason:  "XidError",
+					Message: "Critical XID 79 detected",
+				}},
+			},
+		})
+ if err != nil {
+ log.Fatalf("failed to update status: %v", err)
+ }
+}
+```
+
+---
+
+## Error Codes
+
+| Code | Meaning |
+|------|---------|
+| `NOT_FOUND` | GPU with specified name does not exist |
+| `INVALID_ARGUMENT` | Request contains invalid parameters |
+| `ABORTED` | Resource version conflict (optimistic concurrency) |
+| `INTERNAL` | Server-side error occurred |
+| `UNAVAILABLE` | Server is temporarily unavailable |
+
+---
+
+## See Also
+
+- [Operations Guide](../operations/device-api-server.md)
+- [Design Document](../design/device-api-server.md)
+- [NVML Fallback Provider](../design/nvml-fallback-provider.md)
diff --git a/docs/design/device-api-server.md b/docs/design/device-api-server.md
new file mode 100644
index 000000000..89f159241
--- /dev/null
+++ b/docs/design/device-api-server.md
@@ -0,0 +1,695 @@
+# Device API Server - Design & Implementation Plan
+
+> **Status**: Draft
+> **Author**: NVSentinel Team
+> **Created**: 2026-01-21
+
+## Table of Contents
+
+- [Executive Summary](#executive-summary)
+- [Architecture Overview](#architecture-overview)
+- [Design Decisions](#design-decisions)
+- [Implementation Phases](#implementation-phases)
+- [Directory Structure](#directory-structure)
+- [API Design](#api-design)
+- [Observability](#observability)
+- [Deployment](#deployment)
+
+## Related Documents
+
+- [Implementation Tasks](./device-api-server-tasks.md) - Detailed task breakdown
+- [NVML Fallback Provider](./nvml-fallback-provider.md) - Built-in NVML health provider design
+
+---
+
+## Executive Summary
+
+The Device API Server is a **node-local gRPC cache server** deployed as a Kubernetes DaemonSet. It acts as an intermediary between:
+
+- **Providers** (e.g., NVSentinel health monitors) that update GPU device states
+- **Consumers** (e.g., Device Plugins, DRA Drivers) that read device states for scheduling decisions
+
+### Key Requirements
+
+| Requirement | Description |
+|-------------|-------------|
+| Node-local | DaemonSet running on each GPU node |
+| Read-blocking semantics | MUST block reads during provider updates to prevent stale data |
+| Multiple providers | Support multiple health monitors updating different conditions |
+| Multiple consumers | Support multiple readers (device-plugin, DRA driver, etc.) |
+| Kubernetes patterns | klog/v2, structured logging, health probes |
+| Helm-only deployment | No kustomize, pure Helm chart |
+| Observability | Prometheus metrics, alerting rules |
+
+---
+
+## Architecture Overview
+
+```
+┌─────────────────────────────────────────────────────────────────────────────────┐
+│ Kubernetes Node │
+├─────────────────────────────────────────────────────────────────────────────────┤
+│ │
+│ ┌──────────────────────┐ ┌──────────────────────────────┐ │
+│ │ NVSentinel │ │ Device Plugin / DRA │ │
+│ │ (Health Monitor) │ │ Driver │ │
+│ │ [Provider] │ │ [Consumer] │ │
+│ └──────────┬───────────┘ └──────────────┬───────────────┘ │
+│ │ │ │
+│ │ UpdateGpuStatus() │ GetGpu() │
+│ │ (gRPC) │ ListGpus() │
+│ │ │ WatchGpus() │
+│ ▼ ▼ │
+│ ┌──────────────────────────────────────────────────────────────────────────┐ │
+│ │ Device API Server (DaemonSet) │ │
+│ │ ┌────────────────────────────────────────────────────────────────────┐ │ │
+│ │ │ gRPC Server │ │ │
+│ │ │ ┌────────────────────────────────────────────────────────────┐ │ │ │
+│ │ │ │ GpuService (Unified) │ │ │ │
+│ │ │ │ Write: CreateGpu, UpdateGpu, UpdateGpuStatus, DeleteGpu │ │ │ │
+│ │ │ │ Read: GetGpu, ListGpus, WatchGpus │ │ │ │
+│ │ │ └────────────────────────────────┬───────────────────────────┘ │ │ │
+│ │ │ │ │ │ │
+│ │ │ ▼ │ │ │
+│ │ │ ┌─────────────────────────────────────────────────────────────┐ │ │ │
+│ │ │ │ Cache Layer │ │ │ │
+│ │ │ │ ┌───────────────────────────────────────────────────────┐ │ │ │ │
+│ │ │ │ │ sync.RWMutex (Writer-Preference) │ │ │ │ │
+│ │ │ │ │ │ │ │ │ │
+│ │ │ │ │ Write Lock() ──────────► Blocks ALL new RLock() │ │ │ │ │
+│ │ │ │ │ until write completes │ │ │ │ │
+│ │ │ │ │ │ │ │ │ │
+│ │ │ │ │ This ensures consumers NEVER read stale data when │ │ │ │ │
+│ │ │ │ │ a provider is updating (healthy → unhealthy) │ │ │ │ │
+│ │ │ │ └───────────────────────────────────────────────────────┘ │ │ │ │
+│ │ │ │ │ │ │ │
+│ │ │ │ ┌───────────────────────────────────────────────────────┐ │ │ │ │
+│ │ │ │ │ map[string]*Gpu (In-Memory Store) │ │ │ │ │
+│ │ │ │ └───────────────────────────────────────────────────────┘ │ │ │ │
+│ │ │ └─────────────────────────────────────────────────────────────┘ │ │ │
+│ │ │ │ │ │
+│ │ │ ┌─────────────────────────────────────────────────────────────┐ │ │ │
+│ │ │ │ Watch Broadcaster │ │ │ │
+│ │ │ │ Notifies all WatchGpus() streams on state changes │ │ │ │
+│ │ │ └─────────────────────────────────────────────────────────────┘ │ │ │
+│ │ └────────────────────────────────────────────────────────────────────┘ │ │
+│ │ │ │
+│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────────────┐ │ │
+│ │ │ Health │ │ Metrics │ │ Unix Socket │ │ │
+│ │ │ :8081 │ │ :9090 │ │ /var/run/device-api/device.sock │ │ │
+│ │ │ /healthz │ │ /metrics │ │ (node-local gRPC) │ │ │
+│ │ │ /readyz │ │ │ │ │ │ │
+│ │ └─────────────┘ └─────────────┘ └─────────────────────────────────┘ │ │
+│ └──────────────────────────────────────────────────────────────────────────┘ │
+│ │
+└─────────────────────────────────────────────────────────────────────────────────┘
+```
+
+### Data Flow: Read-Blocking Semantics
+
+```
+Timeline ──────────────────────────────────────────────────────────────────────────►
+
+Provider (NVSentinel) Cache (RWMutex) Consumer (Device Plugin)
+ │ │ │
+ │ │◄──── RLock() ────────────────┤ GetGpu()
+ │ │ (allowed) │
+ │ │──────────────────────────────►│ Returns data
+ │ │ RUnlock() │
+ │ │ │
+ │──── UpdateGpuStatus() ──────►│ │
+ │ Lock() requested │ │
+ │ │ │
+ │ │◄──── RLock() ────────────────┤ GetGpu()
+ │ │ BLOCKED ⛔ │ (waits)
+ │ │ │
+ │◄──── Lock() acquired ────────│ │
+ │ (write in progress) │ │
+ │ │ │
+ │──── Update complete ────────►│ │
+ │ Unlock() │ │
+ │ │ │
+ │ │──── RLock() allowed ─────────►│
+ │ │ (fresh data) │
+ │ │ │
+
+⚠️ CRITICAL: Consumer NEVER reads stale "healthy" state when provider
+ is updating to "unhealthy". The RWMutex writer-preference ensures
+ new readers block once a write is pending.
+```
+
+---
+
+## Design Decisions
+
+### D1: Read-Blocking vs Eventually Consistent
+
+| Option | Pros | Cons | Decision |
+|--------|------|------|----------|
+| **sync.RWMutex (writer-preference)** | Prevents stale reads; simple; Go-native | Readers blocked during writes | ✅ **Selected** |
+| atomic.Value + copy-on-write | Never blocks readers | Readers may see stale data during update | ❌ Rejected |
+| sync.Map | Good for read-heavy | No blocking semantics; may read stale | ❌ Rejected |
+
+**Rationale**: The requirement explicitly states "MUST block reads, preventing false positives when a node 'was' healthy, and the next state is unhealthy." This mandates write-blocking reads.
+
+### D2: Transport Protocol
+
+| Option | Pros | Cons | Decision |
+|--------|------|------|----------|
+| **Unix Socket** | Node-local only; no network exposure; fast | Pod must mount socket path | ✅ **Primary** |
+| TCP localhost | Easy client setup | Requires port allocation | ✅ **Secondary** |
+| hostNetwork + TCP | Accessible from host | Security risk | ❌ Rejected |
+
+**Rationale**: Unix socket provides security isolation and performance for node-local communication. TCP fallback for flexibility.
+
+### D3: Provider Registration Model
+
+| Option | Pros | Cons | Decision |
+|--------|------|------|----------|
+| **Implicit (any caller can update)** | Simple; stateless server | No provider identity tracking | ✅ **Phase 1** |
+| Explicit registration | Track providers; detect failures | More complexity | 🔮 **Phase 2** |
+
+### D4: Logging Framework
+
+| Option | Pros | Cons | Decision |
+|--------|------|------|----------|
+| **klog/v2** | Kubernetes native; contextual logging; JSON format | Slightly verbose API | ✅ **Selected** |
+| zap | Fast; popular | Not Kubernetes native | ❌ Rejected |
+| logr | Interface-based | Needs backend anyway | Used via klog |
+
+---
+
+## Implementation Phases
+
+### Phase 1: Core Server Foundation
+
+**Goal**: Minimal viable gRPC server with cache and blocking semantics.
+
+| Task ID | Task | Description | Estimate |
+|---------|------|-------------|----------|
+| P1.1 | Project scaffolding | Create `cmd/device-api-server/`, `internal/` structure | S |
+| P1.2 | Proto extensions | Add write RPCs (CreateGpu, UpdateGpu, UpdateGpuStatus, DeleteGpu) | M |
+| P1.3 | Cache implementation | Thread-safe cache with RWMutex, writer-preference blocking | M |
+| P1.4 | Consumer gRPC service | Implement GetGpu, ListGpus, WatchGpus (read path) | M |
+| P1.5 | Provider gRPC service | Implement CreateGpu, UpdateGpu, UpdateGpuStatus, DeleteGpu (write path) | M |
+| P1.6 | Watch broadcaster | Fan-out changes to all active WatchGpus streams | M |
+| P1.7 | Graceful shutdown | SIGTERM handling, drain connections, health status | S |
+| P1.8 | Unit tests | Cache tests, service tests, blocking behavior tests | L |
+
+**Deliverables**:
+- Working gRPC server binary
+- Consumer and Provider services
+- Basic health endpoint
+
+---
+
+### Phase 2: Kubernetes Integration
+
+**Goal**: Production-ready DaemonSet with proper k8s integration.
+
+| Task ID | Task | Description | Estimate |
+|---------|------|-------------|----------|
+| P2.1 | klog/v2 integration | Structured logging, contextual loggers, log levels | M |
+| P2.2 | Health probes | gRPC health protocol, HTTP /healthz /readyz endpoints | M |
+| P2.3 | Configuration | Flags, environment variables, config validation | S |
+| P2.4 | Unix socket support | Listen on configurable socket path | S |
+| P2.5 | Signal handling | Proper SIGTERM/SIGINT handling per k8s lifecycle | S |
+| P2.6 | Integration tests | Test with mock providers/consumers | L |
+
+**Deliverables**:
+- Kubernetes-ready binary
+- Health endpoints
+- Configurable via flags/env
+
+---
+
+### Phase 3: Observability
+
+**Goal**: Full observability stack with metrics and alerts.
+
+| Task ID | Task | Description | Estimate |
+|---------|------|-------------|----------|
+| P3.1 | Prometheus metrics | Request counts, latencies, cache stats, connection counts | M |
+| P3.2 | gRPC interceptors | grpc-prometheus interceptors for all RPCs | M |
+| P3.3 | Custom metrics | `device_api_server_gpus_total`, `_unhealthy`, `_cache_*` | M |
+| P3.4 | Metrics endpoint | HTTP /metrics on separate port | S |
+| P3.5 | Alerting rules | PrometheusRule CRD for critical alerts | M |
+| P3.6 | Grafana dashboard | JSON dashboard for visualization | M |
+
+**Metrics to implement**:
+
+```
+# Server metrics
+device_api_server_info{version="...", go_version="..."}
+device_api_server_up
+
+# Cache metrics
+device_api_server_cache_gpus_total
+device_api_server_cache_gpus_healthy
+device_api_server_cache_gpus_unhealthy
+device_api_server_cache_updates_total{provider="..."}
+device_api_server_cache_lock_wait_seconds_bucket
+
+# gRPC metrics (via interceptor)
+grpc_server_started_total{grpc_service, grpc_method}
+grpc_server_handled_total{grpc_service, grpc_method, grpc_code}
+grpc_server_handling_seconds_bucket{grpc_service, grpc_method}
+
+# Watch metrics
+device_api_server_watch_streams_active
+device_api_server_watch_events_total{type="ADDED|MODIFIED|DELETED"}
+```
+
+**Alerts**:
+
+```yaml
+- alert: DeviceAPIServerDown
+ expr: up{job="device-api-server"} == 0
+ for: 5m
+
+- alert: DeviceAPIServerHighLatency
+ expr: histogram_quantile(0.99, grpc_server_handling_seconds_bucket) > 0.5
+ for: 5m
+
+- alert: DeviceAPIServerUnhealthyGPUs
+ expr: device_api_server_cache_gpus_unhealthy > 0
+ for: 1m
+```
+
+---
+
+### Phase 4: Helm Chart
+
+**Goal**: Production-ready Helm chart with all configurations.
+
+| Task ID | Task | Description | Estimate |
+|---------|------|-------------|----------|
+| P4.1 | Chart scaffolding | `charts/device-api-server/` structure | S |
+| P4.2 | DaemonSet template | Node selector, tolerations, resource limits | M |
+| P4.3 | RBAC templates | ServiceAccount, Role, RoleBinding | M |
+| P4.4 | ConfigMap/Secret | Server configuration, TLS certs | M |
+| P4.5 | Service templates | Headless service, metrics service | S |
+| P4.6 | PrometheusRule | Alerting rules as k8s resource | M |
+| P4.7 | ServiceMonitor | Prometheus scrape configuration | S |
+| P4.8 | Values schema | JSON schema for values validation | M |
+| P4.9 | Chart tests | Helm test hooks | M |
+| P4.10 | Documentation | README, NOTES.txt, examples | M |
+
+**Chart Structure**:
+
+```
+charts/device-api-server/
+├── Chart.yaml
+├── values.yaml
+├── values.schema.json
+├── README.md
+├── templates/
+│ ├── _helpers.tpl
+│ ├── daemonset.yaml
+│ ├── serviceaccount.yaml
+│ ├── role.yaml
+│ ├── rolebinding.yaml
+│ ├── configmap.yaml
+│ ├── service.yaml
+│ ├── service-metrics.yaml
+│ ├── servicemonitor.yaml
+│ ├── prometheusrule.yaml
+│ ├── poddisruptionbudget.yaml
+│ └── NOTES.txt
+└── tests/
+ └── test-connection.yaml
+```
+
+---
+
+### Phase 5: Documentation & Polish
+
+**Goal**: Comprehensive documentation and production hardening.
+
+| Task ID | Task | Description | Estimate |
+|---------|------|-------------|----------|
+| P5.1 | Architecture docs | Design document, diagrams | M |
+| P5.2 | API reference | Proto documentation, examples | M |
+| P5.3 | Operations guide | Deployment, troubleshooting, runbooks | L |
+| P5.4 | Developer guide | Contributing, local development | M |
+| P5.5 | Security hardening | TLS, authentication review | M |
+| P5.6 | Performance testing | Benchmark under load | L |
+| P5.7 | CI/CD pipeline | GitHub Actions for build, test, release | M |
+
+---
+
+## Directory Structure
+
+Following the [kubernetes-sigs/node-feature-discovery](https://github.com/kubernetes-sigs/node-feature-discovery) pattern
+where the `api/` is a standalone module and `pkg/` contains public library code:
+
+```
+NVSentinel/
+├── api/ # STANDALONE API MODULE (own go.mod)
+│ ├── gen/go/device/v1alpha1/ # Generated Go code
+│ │ ├── gpu.pb.go
+│ │ └── gpu_grpc.pb.go
+│ ├── proto/device/v1alpha1/ # Proto definitions
+│ │ └── gpu.proto # Unified GpuService (CRUD operations)
+│ ├── go.mod # module github.com/nvidia/nvsentinel/api
+│ ├── go.sum
+│ └── Makefile
+├── cmd/ # Command entry points (thin)
+│ └── device-api-server/
+│ └── main.go # Server entrypoint only
+├── pkg/ # PUBLIC LIBRARY CODE (importable)
+│ ├── deviceapiserver/ # Device API Server implementation
+│ │ ├── cache/ # Thread-safe GPU cache
+│ │ │ ├── cache.go
+│ │ │ ├── cache_test.go
+│ │ │ └── broadcaster.go
+│ │ ├── service/ # gRPC service implementation
+│ │ │ └── gpu_service.go # GpuService (unified read/write)
+│ │ ├── nvml/ # NVML provider (uses gRPC client)
+│ │ │ ├── provider.go
+│ │ │ ├── enumerator.go
+│ │ │ └── health_monitor.go
+│ │ ├── metrics/ # Prometheus metrics
+│ │ └── health/ # Health check handlers
+│ ├── version/ # Version information
+│ │ └── version.go
+│ └── signals/ # Signal handling utilities
+├── charts/ # Helm charts
+│ └── device-api-server/
+│ ├── Chart.yaml
+│ ├── values.yaml
+│ └── templates/
+├── docs/
+│ ├── design/
+│ ├── api/
+│ └── operations/
+├── hack/ # Build/development scripts
+├── test/ # E2E tests
+├── go.mod # Root module with replace directive
+├── go.sum
+└── Makefile
+```
+
+**Key Layout Decisions:**
+
+| Directory | Purpose | Importable |
+|-----------|---------|------------|
+| `api/` | Standalone API module for versioning | Yes (own module) |
+| `pkg/` | Public library code | Yes |
+| `cmd/` | Thin entry points | No |
+| `charts/` | Helm deployment | N/A |
+
+Root `go.mod` uses: `replace github.com/nvidia/nvsentinel/api => ./api`
+
+---
+
+## API Design
+
+### Unified GpuService
+
+Following Kubernetes API conventions, the API is consolidated into a single `GpuService` with standard CRUD methods:
+
+```protobuf
+// GpuService provides a unified API for managing GPU resources.
+//
+// Read operations (Get, List, Watch) are intended for consumers.
+// Write operations (Create, Update, UpdateStatus, Delete) are intended for providers.
+service GpuService {
+ // Read Operations
+  rpc GetGpu(GetGpuRequest) returns (GetGpuResponse);
+ rpc ListGpus(ListGpusRequest) returns (ListGpusResponse);
+ rpc WatchGpus(WatchGpusRequest) returns (stream WatchGpusResponse);
+
+ // Write Operations
+ rpc CreateGpu(CreateGpuRequest) returns (CreateGpuResponse);
+ rpc UpdateGpu(UpdateGpuRequest) returns (Gpu);
+ rpc UpdateGpuStatus(UpdateGpuStatusRequest) returns (Gpu);
+ rpc DeleteGpu(DeleteGpuRequest) returns (google.protobuf.Empty);
+}
+
+message CreateGpuRequest {
+ Gpu gpu = 1; // metadata.name and spec.uuid required
+}
+
+message CreateGpuResponse {
+ Gpu gpu = 1;
+ bool created = 2; // true if new, false if already existed
+}
+
+message UpdateGpuRequest {
+ Gpu gpu = 1; // includes resource_version for optimistic concurrency
+}
+
+message UpdateGpuStatusRequest {
+ string name = 1;
+ GpuStatus status = 2;
+ int64 resource_version = 3; // optional, for conflict detection
+}
+
+message DeleteGpuRequest {
+ string name = 1;
+}
+```
+
+**Design Rationale**:
+- Single service simplifies API surface and tooling compatibility
+- Standard CRUD verbs enable better integration with Kubernetes patterns
+- `UpdateGpuStatus` follows the Kubernetes subresource pattern
+- Optimistic concurrency via `resource_version` prevents lost updates
+
+---
+
+## Observability
+
+### Metrics Architecture
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│ Device API Server │
+│ │
+│ ┌─────────────────────────────────────────────────────────┐ │
+│ │ gRPC Interceptors │ │
+│ │ grpc_server_started_total │ │
+│ │ grpc_server_handled_total │ │
+│ │ grpc_server_handling_seconds_bucket │ │
+│ └─────────────────────────────────────────────────────────┘ │
+│ │
+│ ┌─────────────────────────────────────────────────────────┐ │
+│ │ Custom Metrics │ │
+│ │ device_api_server_cache_gpus_total │ │
+│ │ device_api_server_cache_lock_contention_total │ │
+│ │ device_api_server_watch_streams_active │ │
+│ └─────────────────────────────────────────────────────────┘ │
+│ │
+│ ┌─────────────────────────────────────────────────────────┐ │
+│ │ Go Runtime Metrics │ │
+│ │ go_goroutines │ │
+│ │ go_memstats_alloc_bytes │ │
+│ │ process_cpu_seconds_total │ │
+│ └─────────────────────────────────────────────────────────┘ │
+│ │ │
+│ ▼ │
+│ :9090/metrics │
+│ │ │
+└──────────────────────────────┼───────────────────────────────────┘
+ │
+ ▼
+┌─────────────────────────────────────────────────────────────────┐
+│ Prometheus │
+│ │
+│ ServiceMonitor ──► scrape_configs │
+│ │
+│ PrometheusRule ──► alerting_rules │
+│ │
+└─────────────────────────────────────────────────────────────────┘
+ │
+ ▼
+┌─────────────────────────────────────────────────────────────────┐
+│ Grafana │
+│ │
+│ Dashboard: Device API Server Overview │
+│ - Request rate / error rate │
+│ - P50/P99 latency │
+│ - GPU health summary │
+│ - Cache statistics │
+│ - Active watch streams │
+│ │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## Deployment
+
+### Helm Values (Key Configuration)
+
+```yaml
+# values.yaml
+replicaCount: 1 # DaemonSet ignores this, but kept for consistency
+
+image:
+ repository: ghcr.io/nvidia/device-api-server
+ tag: "" # Defaults to Chart appVersion
+ pullPolicy: IfNotPresent
+
+# Server configuration
+server:
+ # gRPC listen address (TCP) - localhost only by default for security
+ # Set to ":50051" to bind to all interfaces (WARNING: unauthenticated API)
+ grpcAddress: "127.0.0.1:50051"
+ # Unix socket path (primary for node-local)
+ unixSocket: /var/run/device-api/device.sock
+ # Health probe port
+ healthPort: 8081
+ # Metrics port
+ metricsPort: 9090
+
+# Logging
+logging:
+ # Log level (0=info, higher=more verbose)
+ verbosity: 0
+ # Output format: text, json
+ format: json
+
+# Node selection
+nodeSelector:
+ nvidia.com/gpu.present: "true"
+
+tolerations:
+ - key: nvidia.com/gpu
+ operator: Exists
+ effect: NoSchedule
+
+resources:
+ requests:
+ cpu: 50m
+ memory: 64Mi
+ limits:
+ cpu: 200m
+ memory: 256Mi
+
+# Security
+securityContext:
+ runAsNonRoot: true
+ runAsUser: 65534
+ readOnlyRootFilesystem: true
+ allowPrivilegeEscalation: false
+
+# RBAC
+serviceAccount:
+ create: true
+ name: ""
+ automountServiceAccountToken: false
+
+rbac:
+ create: true
+
+# Observability
+metrics:
+ enabled: true
+ serviceMonitor:
+ enabled: true
+ interval: 30s
+ scrapeTimeout: 10s
+ prometheusRule:
+ enabled: true
+
+# Health probes
+probes:
+ liveness:
+ initialDelaySeconds: 5
+ periodSeconds: 10
+ readiness:
+ initialDelaySeconds: 5
+ periodSeconds: 10
+```
+
+### DaemonSet Topology
+
+```
+┌─────────────────────────────────────────────────────────────────────────────────┐
+│ Kubernetes Cluster │
+├─────────────────────────────────────────────────────────────────────────────────┤
+│ │
+│ ┌───────────────────────┐ ┌───────────────────────┐ ┌───────────────────────┐│
+│ │ GPU Node 1 │ │ GPU Node 2 │ │ GPU Node 3 ││
+│ │ │ │ │ │ ││
+│ │ ┌─────────────────┐ │ │ ┌─────────────────┐ │ │ ┌─────────────────┐ ││
+│ │ │ device-api- │ │ │ │ device-api- │ │ │ │ device-api- │ ││
+│ │ │ server pod │ │ │ │ server pod │ │ │ │ server pod │ ││
+│ │ │ │ │ │ │ │ │ │ │ │ ││
+│ │ │ GPU-0: Healthy │ │ │ │ GPU-0: Healthy │ │ │ │ GPU-0: Unhealthy│ ││
+│ │ │ GPU-1: Healthy │ │ │ │ GPU-1: Healthy │ │ │ │ GPU-1: Healthy │ ││
+│ │ │ GPU-2: Healthy │ │ │ │ │ │ │ │ GPU-2: Healthy │ ││
+│ │ │ GPU-3: Healthy │ │ │ │ │ │ │ │ GPU-3: Healthy │ ││
+│ │ └─────────────────┘ │ │ └─────────────────┘ │ │ └─────────────────┘ ││
+│ │ │ │ │ │ ││
+│ │ /var/run/device-api/ │ │ /var/run/device-api/ │ │ /var/run/device-api/ ││
+│ │ device.sock │ │ device.sock │ │ device.sock ││
+│ │ │ │ │ │ ││
+│ └───────────────────────┘ └───────────────────────┘ └───────────────────────┘│
+│ │
+│ ┌───────────────────────┐ │
+│ │ Non-GPU Node │ (DaemonSet does NOT schedule here due to │
+│ │ (No GPU) │ nodeSelector: nvidia.com/gpu.present=true) │
+│ └───────────────────────┘ │
+│ │
+└─────────────────────────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## Risk Assessment
+
+| Risk | Impact | Likelihood | Mitigation |
+|------|--------|------------|------------|
+| Cache corruption on concurrent writes | High | Low | RWMutex provides exclusivity |
+| Watch stream memory leak | Medium | Medium | Bounded channels, timeouts |
+| Provider not updating (stale data) | High | Medium | Health checks, provider heartbeat (Phase 2) |
+| Socket permission issues | Medium | Medium | Init container for socket dir |
+| High lock contention | Medium | Low | Metrics to detect, sharding if needed |
+
+---
+
+## Success Criteria
+
+### Phase 1
+- [ ] Server starts and accepts gRPC connections
+- [ ] Provider can register/update/unregister GPUs
+- [ ] Consumer can Get/List/Watch GPUs
+- [ ] Read-blocking verified under concurrent load
+
+### Phase 2
+- [ ] Structured logs with klog/v2
+- [ ] Health probes pass in Kubernetes
+- [ ] Unix socket communication works
+
+### Phase 3
+- [ ] Prometheus metrics exposed
+- [ ] Grafana dashboard visualizes key metrics
+- [ ] Alerts fire correctly in test scenarios
+
+### Phase 4
+- [ ] `helm install` works out of box
+- [ ] DaemonSet schedules on GPU nodes only
+- [ ] RBAC properly scoped
+
+### Phase 5
+- [ ] Documentation complete
+- [ ] CI/CD pipeline green
+- [ ] Performance benchmarks pass
+
+---
+
+## Appendix: Research References
+
+1. **Kubernetes DaemonSet gRPC Best Practices** - Health probes, graceful shutdown, load balancing
+2. **Go sync.RWMutex** - Writer-preference semantics, blocking behavior
+3. **klog/v2** - Structured logging, contextual logging, JSON format
+4. **Helm Chart Best Practices** - RBAC, ServiceAccount, DaemonSet templates
+5. **grpc-prometheus** - Metrics interceptors, histogram configuration
+
+---
+
+*Document version: 1.0*
+*Last updated: 2026-01-21*
diff --git a/docs/operations/device-api-server.md b/docs/operations/device-api-server.md
new file mode 100644
index 000000000..96df4804a
--- /dev/null
+++ b/docs/operations/device-api-server.md
@@ -0,0 +1,358 @@
+# Device API Server - Operations Guide
+
+This guide covers deployment, configuration, monitoring, and troubleshooting of the Device API Server.
+
+## Architecture Overview
+
+The Device API Server is a pure Go gRPC server with no hardware dependencies.
+GPU enumeration and health monitoring are provided by external providers (sidecars).
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│ GPU Node │
+│ ┌─────────────────────────────────────────────────────────┐│
+│ │ Device API Server (DaemonSet) ││
+│ │ ┌─────────────────────────────────────────────────┐ ││
+│ │ │ GpuService (unified) │ ││
+│ │ │ Read: GetGpu, ListGpus, WatchGpus │ ││
+│ │ │ Write: CreateGpu, UpdateGpuStatus, DeleteGpu │ ││
+│ │ └────────────────────┬────────────────────────────┘ ││
+│ │ │ ││
+│ │ ▼ ││
+│ │ ┌─────────────────────────────────────────────────────┐││
+│ │ │ GPU Cache (RWMutex) │││
+│ │ │ - Read-blocking during writes │││
+│ │ │ - Watch event broadcasting │││
+│ │ └─────────────────────────────────────────────────────┘││
+│ └─────────────────────────────────────────────────────────┘│
+│ │
+│ Providers (gRPC clients): │
+│ - nvml-provider sidecar (GPU enumeration, XID monitoring) │
+│ - Custom providers (CreateGpu, UpdateGpuStatus) │
+│ │
+│ Consumers (gRPC clients): │
+│ - Device plugins (GetGpu, ListGpus, WatchGpus) │
+│ - DRA drivers (GetGpu, ListGpus, WatchGpus) │
+└─────────────────────────────────────────────────────────────┘
+```
+
+## Deployment
+
+### Prerequisites
+
+- Kubernetes 1.25+
+- Helm 3.0+
+- GPU nodes with label `nvidia.com/gpu.present=true`
+- (Optional) Prometheus Operator for monitoring
+
+### Installation
+
+**Basic Installation**:
+
+```bash
+helm install device-api-server ./deployments/helm/device-api-server \
+ --namespace device-api --create-namespace
+```
+
+**With Prometheus Monitoring**:
+
+```bash
+helm install device-api-server ./deployments/helm/device-api-server \
+ --namespace device-api --create-namespace \
+ --set metrics.serviceMonitor.enabled=true \
+ --set metrics.prometheusRule.enabled=true
+```
+
+### Verify Installation
+
+```bash
+# Check DaemonSet status
+kubectl get daemonset -n device-api
+
+# Check pods are running on GPU nodes
+kubectl get pods -n device-api -o wide
+
+# Check logs
+kubectl logs -n device-api -l app.kubernetes.io/name=device-api-server
+```
+
+---
+
+## Configuration
+
+### Command-Line Flags
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--bind-address` | `unix:///var/run/nvidia-device-api/device-api.sock` | Unix socket URI for the gRPC device API |
+| `--health-probe-bind-address` | `:50051` | TCP address for gRPC health and reflection |
+| `--metrics-bind-address` | `:9090` | TCP address for HTTP Prometheus metrics |
+| `--shutdown-grace-period` | `25s` | Maximum time to wait for graceful shutdown |
+| `--hostname-override` | (auto-detected) | Override the node hostname (must be a valid DNS subdomain) |
+| `-v` | `0` | Log verbosity level (klog) |
+
+### Helm Values
+
+See [values.yaml](../../deployments/helm/device-api-server/values.yaml) for the complete reference.
+
+Key configuration sections:
+
+```yaml
+# Server configuration
+server:
+ unixSocket: /var/run/device-api/device.sock
+ healthPort: 8081
+ metricsPort: 9090
+ shutdownGracePeriod: 25
+ shutdownDelay: 5
+
+# Node scheduling
+nodeSelector:
+ nvidia.com/gpu.present: "true"
+
+# Resources
+resources:
+ requests:
+ cpu: 50m
+ memory: 64Mi
+ limits:
+ cpu: 200m
+ memory: 256Mi
+```
+
+---
+
+## GPU Providers
+
+The Device API Server is a pure Go gRPC server with no hardware dependencies.
+GPU enumeration and health monitoring are provided by external providers that connect
+as gRPC clients:
+
+- **nvml-provider sidecar** - Recommended NVML-based provider for GPU enumeration and XID monitoring
+- **Custom providers** - Any gRPC client can register GPUs via `CreateGpu` and update health via `UpdateGpuStatus`
+
+See the [nvml-provider demo](../../demos/nvml-sidecar-demo.sh) for an example sidecar deployment.
+
+---
+
+## Monitoring
+
+### Health Endpoints
+
+| Endpoint | Port | Description |
+|----------|------|-------------|
+| `/healthz` | 8081 | Liveness probe - server is running |
+| `/readyz` | 8081 | Readiness probe - server is accepting traffic |
+| `/metrics` | 9090 | Prometheus metrics |
+
+### Prometheus Metrics
+
+**Server Metrics**:
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `device_api_server_info` | Gauge | Server information (version, go_version) |
+
+**Cache Metrics**:
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `device_api_server_cache_gpus_total` | Gauge | Total GPUs in cache |
+| `device_api_server_cache_gpus_healthy` | Gauge | Healthy GPUs |
+| `device_api_server_cache_gpus_unhealthy` | Gauge | Unhealthy GPUs |
+| `device_api_server_cache_gpus_unknown` | Gauge | GPUs with unknown status |
+| `device_api_server_cache_updates_total` | Counter | Cache update operations |
+| `device_api_server_cache_resource_version` | Gauge | Current cache version |
+
+**Watch Metrics**:
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `device_api_server_watch_streams_active` | Gauge | Active watch streams |
+| `device_api_server_watch_events_total` | Counter | Watch events sent |
+
+### Alerting Rules
+
+When `metrics.prometheusRule.enabled=true`, the following alerts are created:
+
+| Alert | Severity | Condition |
+|-------|----------|-----------|
+| `DeviceAPIServerDown` | Critical | Server unreachable for 5m |
+| `DeviceAPIServerHighLatency` | Warning | P99 latency > 500ms |
+| `DeviceAPIServerHighErrorRate` | Warning | Error rate > 10% |
+| `DeviceAPIServerUnhealthyGPUs` | Warning | Unhealthy GPUs > 0 |
+| `DeviceAPIServerNoGPUs` | Warning | No GPUs for 10m |
+| `DeviceAPIServerHighMemory` | Warning | Memory > 512MB |
+
+### Grafana Dashboard
+
+Example PromQL queries for dashboards:
+
+```promql
+# GPU health overview
+device_api_server_cache_gpus_healthy / device_api_server_cache_gpus_total * 100
+
+# Watch stream activity
+rate(device_api_server_watch_events_total[5m])
+
+# Cache update rate
+rate(device_api_server_cache_updates_total[5m])
+```
+
+---
+
+## Troubleshooting
+
+### Pod Not Scheduling
+
+**Symptom**: DaemonSet shows 0/N pods ready
+
+**Check**:
+
+```bash
+# Verify node labels
+kubectl get nodes --show-labels | grep gpu
+
+# Check DaemonSet events
+kubectl describe daemonset -n device-api device-api-server
+```
+
+**Solution**: Ensure nodes have `nvidia.com/gpu.present=true` label or override `nodeSelector`.
+
+### Permission Denied on Unix Socket
+
+**Symptom**: Clients cannot connect to Unix socket
+
+**Check**:
+
+```bash
+# Check socket permissions on node
+ls -la /var/run/device-api/
+```
+
+**Solution**: Verify `securityContext` allows socket creation, or adjust `runAsUser`.
+
+### GPUs Not Appearing
+
+**Symptom**: `ListGpus` returns empty
+
+**Check**:
+
+```bash
+# Check for GPU enumeration errors
+kubectl logs -n device-api -l app.kubernetes.io/name=device-api-server | grep -i error
+
+# Check if provider sidecar is running
+kubectl get pods -n device-api -o wide
+```
+
+**Solutions**:
+1. Deploy the nvml-provider sidecar: see [nvml-provider demo](../../demos/nvml-sidecar-demo.sh)
+2. Deploy an external health provider
+3. Verify the provider can connect to the Device API Server
+
+### High Memory Usage
+
+**Symptom**: Pod OOMKilled or memory alerts firing
+
+**Check**:
+
+```bash
+# Check current memory usage
+kubectl top pods -n device-api
+
+# Check watch stream count
+curl -s http://localhost:9090/metrics | grep watch_streams
+```
+
+**Solutions**:
+1. Increase memory limits
+2. Investigate clients creating excessive watch streams
+3. Check for memory leaks in logs
+
+### Watch Stream Disconnections
+
+**Symptom**: Consumers report frequent reconnections
+
+**Check**:
+
+```bash
+# Check network policy
+kubectl get networkpolicy -n device-api
+
+# Check for errors in logs
+kubectl logs -n device-api -l app.kubernetes.io/name=device-api-server | grep -i "stream\|watch"
+```
+
+**Solutions**:
+1. Ensure network policies allow intra-node traffic
+2. Check client timeout settings
+3. Verify server is not overloaded
+
+---
+
+## Graceful Shutdown
+
+The server implements graceful shutdown:
+
+1. **PreStop Hook**: Sleeps for `shutdownDelay` seconds
+2. **Signal Handling**: Catches SIGTERM/SIGINT
+3. **Drain Period**: Stops accepting new connections
+4. **In-Flight Completion**: Waits for active requests (up to `shutdownGracePeriod`)
+5. **Resource Cleanup**: Closes connections
+
+**Timeline**:
+
+```
+SIGTERM → [shutdownDelay] → Stop listeners → [shutdownGracePeriod] → Force close
+```
+
+Configure in Helm:
+
+```yaml
+server:
+ shutdownGracePeriod: 25 # Max wait for in-flight requests (seconds)
+ shutdownDelay: 5 # Pre-shutdown delay for endpoint propagation (seconds)
+```
+
+---
+
+## Security Considerations
+
+### Pod Security
+
+Default security context (non-root, restricted):
+
+```yaml
+securityContext:
+ runAsNonRoot: true
+ runAsUser: 65534
+ runAsGroup: 65534
+ readOnlyRootFilesystem: true
+ allowPrivilegeEscalation: false
+ capabilities:
+ drop:
+ - ALL
+```
+
+### Network Security
+
+> **Warning**: The gRPC API is unauthenticated.
+
+- The gRPC device API binds to a **Unix domain socket** by default (`--bind-address=unix:///var/run/nvidia-device-api/device-api.sock`). This limits access to processes on the same node.
+- The health probe endpoint (`--health-probe-bind-address`) binds to a TCP port for kubelet probes but only serves gRPC health and reflection, not the device API.
+- In multi-tenant or partially untrusted clusters, use a Kubernetes `NetworkPolicy` to restrict access to the health and metrics TCP ports.
+
+### Service Account
+
+- `automountServiceAccountToken: false` by default
+- No Kubernetes API access required
+
+---
+
+## See Also
+
+- [API Reference](../api/device-api-server.md)
+- [Design Document](../design/device-api-server.md)
+- [Helm Chart README](../../deployments/helm/device-api-server/README.md)
+- [NVML Sidecar Demo](../../demos/nvml-sidecar-demo.sh)
diff --git a/examples/fake-client/main_test.go b/examples/fake-client/main_test.go
index c552f566a..bc80953fe 100644
--- a/examples/fake-client/main_test.go
+++ b/examples/fake-client/main_test.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@ package main_test
import (
"context"
+ "sync"
"testing"
"time"
@@ -31,6 +32,82 @@ import (
"k8s.io/client-go/tools/cache"
)
+// bookmarkWatch wraps a watch.Interface to inject a bookmark event after
+// creation. This is needed because k8s.io/client-go v0.35+ requires bookmark
+// events for the reflector to consider initial sync complete, but the fake
+// client's ObjectTracker doesn't send them automatically.
+type bookmarkWatch struct {
+ watch.Interface
+ bookmarkCh chan watch.Event
+ resultCh chan watch.Event
+ stopCh chan struct{}
+ stopOnce sync.Once
+}
+
+func newBookmarkWatch(w watch.Interface) *bookmarkWatch {
+ bw := &bookmarkWatch{
+ Interface: w,
+ bookmarkCh: make(chan watch.Event, 1),
+ resultCh: make(chan watch.Event),
+ stopCh: make(chan struct{}),
+ }
+
+ // Send initial bookmark to signal list completion.
+ // The bookmark object must be the same type as the expected resource (GPU).
+ bw.bookmarkCh <- watch.Event{
+ Type: watch.Bookmark,
+ Object: &devicev1alpha1.GPU{
+ ObjectMeta: metav1.ObjectMeta{
+ ResourceVersion: "0",
+ Annotations: map[string]string{
+ metav1.InitialEventsAnnotationKey: "true",
+ },
+ },
+ },
+ }
+
+ // Multiplex bookmark and underlying watch events
+ go func() {
+ defer close(bw.resultCh)
+ for {
+ select {
+ case <-bw.stopCh:
+ return
+ case ev, ok := <-bw.bookmarkCh:
+ if ok {
+ select {
+ case bw.resultCh <- ev:
+ case <-bw.stopCh:
+ return
+ }
+ }
+ case ev, ok := <-w.ResultChan():
+ if !ok {
+ return
+ }
+ select {
+ case bw.resultCh <- ev:
+ case <-bw.stopCh:
+ return
+ }
+ }
+ }
+ }()
+
+ return bw
+}
+
+func (bw *bookmarkWatch) ResultChan() <-chan watch.Event {
+ return bw.resultCh
+}
+
+func (bw *bookmarkWatch) Stop() {
+ bw.stopOnce.Do(func() {
+ close(bw.stopCh)
+ })
+ bw.Interface.Stop()
+}
+
func TestGPUInformerWithFakeClient(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
@@ -47,6 +124,10 @@ func TestGPUInformerWithFakeClient(t *testing.T) {
// signal the test when the informer has successfully established its
// stream, preventing race conditions where events are injected before
// the watcher is ready.
+ //
+ // The reactor also wraps the watch to inject a bookmark event, which is
+ // required by k8s.io/client-go v0.35+ for the reflector to consider the
+ // initial sync complete.
client.PrependWatchReactor("*", func(action clienttesting.Action) (handled bool, ret watch.Interface, err error) {
watchAction, ok := action.(clienttesting.WatchActionImpl)
if !ok {
@@ -58,15 +139,18 @@ func TestGPUInformerWithFakeClient(t *testing.T) {
ns := action.GetNamespace()
// Manually invoke the tracker to create the watch stream.
- watch, err := client.Tracker().Watch(gvr, ns, opts)
+ w, err := client.Tracker().Watch(gvr, ns, opts)
if err != nil {
return false, nil, err
}
+ // Wrap watch to inject initial bookmark event for reflector sync
+ wrappedWatch := newBookmarkWatch(w)
+
// Close the channel to notify the test that the Informer is now
// listening for events.
close(watcherStarted)
- return true, watch, nil
+ return true, wrappedWatch, nil
})
// Create a factory for the informers.
diff --git a/go.mod b/go.mod
index d1f0ae9d1..23a936b27 100644
--- a/go.mod
+++ b/go.mod
@@ -1,20 +1,20 @@
module github.com/nvidia/nvsentinel
-go 1.25.5
+go 1.25.0
require (
+ github.com/NVIDIA/go-nvml v0.12.9-0
github.com/go-logr/logr v1.4.3
github.com/go-logr/stdr v1.2.2
github.com/google/go-cmp v0.7.0
github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1
github.com/k3s-io/kine v1.14.2
github.com/prometheus/client_golang v1.23.2
- github.com/spf13/cobra v1.10.2
- github.com/spf13/pflag v1.0.9
+ github.com/spf13/pflag v1.0.10
go.uber.org/goleak v1.3.0
golang.org/x/sync v0.18.0
- google.golang.org/grpc v1.78.0
- google.golang.org/protobuf v1.36.11
+ google.golang.org/grpc v1.77.0
+ google.golang.org/protobuf v1.36.10
k8s.io/apimachinery v0.35.0
k8s.io/apiserver v0.35.0
k8s.io/client-go v0.35.0
@@ -26,7 +26,6 @@ require (
require (
cel.dev/expr v0.24.0 // indirect
filippo.io/edwards25519 v1.1.0 // indirect
- github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect
github.com/NYTimes/gziphandler v1.1.1 // indirect
github.com/Rican7/retry v0.3.1 // indirect
github.com/antlr4-go/antlr/v4 v4.13.0 // indirect
@@ -75,7 +74,6 @@ require (
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mattn/go-sqlite3 v1.14.32 // indirect
github.com/minio/highwayhash v1.0.3 // indirect
- github.com/moby/term v0.5.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
@@ -93,6 +91,7 @@ require (
github.com/shengdoushi/base58 v1.0.0 // indirect
github.com/sirupsen/logrus v1.9.3 // indirect
github.com/soheilhy/cmux v0.1.5 // indirect
+ github.com/spf13/cobra v1.10.0 // indirect
github.com/stoewer/go-strcase v1.3.0 // indirect
github.com/tidwall/btree v1.8.1 // indirect
github.com/tmc/grpc-websocket-proxy v0.0.0-20220101234140-673ab2c3ae75 // indirect
@@ -128,7 +127,7 @@ require (
golang.org/x/text v0.31.0 // indirect
golang.org/x/time v0.12.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
- google.golang.org/genproto/googleapis/api v0.0.0-20251029180050-ab9386a59fda // indirect
+ google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 // indirect
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
diff --git a/go.sum b/go.sum
index 7c99213db..cb1e0c3fa 100644
--- a/go.sum
+++ b/go.sum
@@ -2,10 +2,10 @@ cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY=
cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw=
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
-github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0=
-github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0=
github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
+github.com/NVIDIA/go-nvml v0.12.9-0 h1:e344UK8ZkeMeeLkdQtRhmXRxNf+u532LDZPGMtkdus0=
+github.com/NVIDIA/go-nvml v0.12.9-0/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4=
github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I=
github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c=
github.com/Rican7/retry v0.3.1 h1:scY4IbO8swckzoA/11HgBwaZRJEyY9vaNJshcdhp1Mc=
@@ -32,8 +32,6 @@ github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
-github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY=
-github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -151,8 +149,6 @@ github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuE
github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/minio/highwayhash v1.0.3 h1:kbnuUMoHYyVl7szWjSxJnxw11k2U709jqFPPmIUyD6Q=
github.com/minio/highwayhash v1.0.3/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ=
-github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0=
-github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@@ -201,10 +197,11 @@ github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/soheilhy/cmux v0.1.5 h1:jjzc5WVemNEDTLwv9tlmemhC73tI08BNOIGwBOo10Js=
github.com/soheilhy/cmux v0.1.5/go.mod h1:T7TcVDs9LWfQgPlPsdngu6I6QIoyIFZDDC6sNE1GqG0=
-github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU=
-github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4=
-github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY=
-github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
+github.com/spf13/cobra v1.10.0 h1:a5/WeUlSDCvV5a45ljW2ZFtV0bTDpkfSAj3uqB6Sc+0=
+github.com/spf13/cobra v1.10.0/go.mod h1:9dhySC7dnTtEiqzmqfkLj47BslqLCUPMXjG2lj/NgoE=
+github.com/spf13/pflag v1.0.8/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
+github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk=
+github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs=
github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
@@ -309,7 +306,6 @@ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc=
@@ -338,14 +334,14 @@ gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw
gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
-google.golang.org/genproto/googleapis/api v0.0.0-20251029180050-ab9386a59fda h1:+2XxjfsAu6vqFxwGBRcHiMaDCuZiqXGDUDVWVtrFAnE=
-google.golang.org/genproto/googleapis/api v0.0.0-20251029180050-ab9386a59fda/go.mod h1:fDMmzKV90WSg1NbozdqrE64fkuTv6mlq2zxo9ad+3yo=
+google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8 h1:mepRgnBZa07I4TRuomDE4sTIYieg/osKmzIf4USdWS4=
+google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8/go.mod h1:fDMmzKV90WSg1NbozdqrE64fkuTv6mlq2zxo9ad+3yo=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 h1:Wgl1rcDNThT+Zn47YyCXOXyX/COgMTIdhJ717F0l4xk=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
-google.golang.org/grpc v1.78.0 h1:K1XZG/yGDJnzMdd/uZHAkVqJE+xIDOcmdSFZkBUicNc=
-google.golang.org/grpc v1.78.0/go.mod h1:I47qjTo4OKbMkjA/aOOwxDIiPSBofUtQUI5EfpWvW7U=
-google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
-google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
+google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM=
+google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig=
+google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=
+google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
diff --git a/hack/boilerplate.go.txt b/hack/boilerplate.go.txt
index e1732e8d5..6307eef7b 100644
--- a/hack/boilerplate.go.txt
+++ b/hack/boilerplate.go.txt
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/internal/generated/device/v1alpha1/gpu.pb.go b/internal/generated/device/v1alpha1/gpu.pb.go
index 17419e268..d184d3eb9 100644
--- a/internal/generated/device/v1alpha1/gpu.pb.go
+++ b/internal/generated/device/v1alpha1/gpu.pb.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
// Code generated by protoc-gen-go. DO NOT EDIT.
// versions:
// protoc-gen-go v1.36.10
-// protoc v6.33.0
+// protoc v6.33.4
// source: device/v1alpha1/gpu.proto
package v1alpha1
@@ -1173,6 +1173,64 @@ func (x *UpdateGpuRequest) GetOpts() *UpdateOptions {
return nil
}
+// UpdateGpuStatusRequest specifies the GPU whose status should be updated.
+// Only metadata (name, namespace, resource_version) and status fields are used.
+type UpdateGpuStatusRequest struct {
+ state protoimpl.MessageState `protogen:"open.v1"`
+ // gpu is the GPU resource with updated status.
+ // The server reads metadata.name, metadata.namespace, metadata.resource_version
+ // and status from this object. All other fields are ignored.
+ Gpu *Gpu `protobuf:"bytes,1,opt,name=gpu,proto3" json:"gpu,omitempty"`
+ // opts contains the options for the update.
+ Opts *UpdateOptions `protobuf:"bytes,2,opt,name=opts,proto3" json:"opts,omitempty"`
+ unknownFields protoimpl.UnknownFields
+ sizeCache protoimpl.SizeCache
+}
+
+func (x *UpdateGpuStatusRequest) Reset() {
+ *x = UpdateGpuStatusRequest{}
+ mi := &file_device_v1alpha1_gpu_proto_msgTypes[20]
+ ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+ ms.StoreMessageInfo(mi)
+}
+
+func (x *UpdateGpuStatusRequest) String() string {
+ return protoimpl.X.MessageStringOf(x)
+}
+
+func (*UpdateGpuStatusRequest) ProtoMessage() {}
+
+func (x *UpdateGpuStatusRequest) ProtoReflect() protoreflect.Message {
+ mi := &file_device_v1alpha1_gpu_proto_msgTypes[20]
+ if x != nil {
+ ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+ if ms.LoadMessageInfo() == nil {
+ ms.StoreMessageInfo(mi)
+ }
+ return ms
+ }
+ return mi.MessageOf(x)
+}
+
+// Deprecated: Use UpdateGpuStatusRequest.ProtoReflect.Descriptor instead.
+func (*UpdateGpuStatusRequest) Descriptor() ([]byte, []int) {
+ return file_device_v1alpha1_gpu_proto_rawDescGZIP(), []int{20}
+}
+
+func (x *UpdateGpuStatusRequest) GetGpu() *Gpu {
+ if x != nil {
+ return x.Gpu
+ }
+ return nil
+}
+
+func (x *UpdateGpuStatusRequest) GetOpts() *UpdateOptions {
+ if x != nil {
+ return x.Opts
+ }
+ return nil
+}
+
type DeleteGpuRequest struct {
state protoimpl.MessageState `protogen:"open.v1"`
// The unique resource name of the GPU to delete.
@@ -1190,7 +1248,7 @@ type DeleteGpuRequest struct {
func (x *DeleteGpuRequest) Reset() {
*x = DeleteGpuRequest{}
- mi := &file_device_v1alpha1_gpu_proto_msgTypes[20]
+ mi := &file_device_v1alpha1_gpu_proto_msgTypes[21]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
@@ -1202,7 +1260,7 @@ func (x *DeleteGpuRequest) String() string {
func (*DeleteGpuRequest) ProtoMessage() {}
func (x *DeleteGpuRequest) ProtoReflect() protoreflect.Message {
- mi := &file_device_v1alpha1_gpu_proto_msgTypes[20]
+ mi := &file_device_v1alpha1_gpu_proto_msgTypes[21]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
@@ -1215,7 +1273,7 @@ func (x *DeleteGpuRequest) ProtoReflect() protoreflect.Message {
// Deprecated: Use DeleteGpuRequest.ProtoReflect.Descriptor instead.
func (*DeleteGpuRequest) Descriptor() ([]byte, []int) {
- return file_device_v1alpha1_gpu_proto_rawDescGZIP(), []int{20}
+ return file_device_v1alpha1_gpu_proto_rawDescGZIP(), []int{21}
}
func (x *DeleteGpuRequest) GetName() string {
@@ -1306,18 +1364,22 @@ const file_device_v1alpha1_gpu_proto_rawDesc = "" +
"\x04opts\x18\x02 \x01(\v2).nvidia.nvsentinel.v1alpha1.CreateOptionsR\x04opts\"\x84\x01\n" +
"\x10UpdateGpuRequest\x121\n" +
"\x03gpu\x18\x01 \x01(\v2\x1f.nvidia.nvsentinel.v1alpha1.GpuR\x03gpu\x12=\n" +
+ "\x04opts\x18\x02 \x01(\v2).nvidia.nvsentinel.v1alpha1.UpdateOptionsR\x04opts\"\x8a\x01\n" +
+ "\x16UpdateGpuStatusRequest\x121\n" +
+ "\x03gpu\x18\x01 \x01(\v2\x1f.nvidia.nvsentinel.v1alpha1.GpuR\x03gpu\x12=\n" +
"\x04opts\x18\x02 \x01(\v2).nvidia.nvsentinel.v1alpha1.UpdateOptionsR\x04opts\"\x83\x01\n" +
"\x10DeleteGpuRequest\x12\x12\n" +
"\x04name\x18\x01 \x01(\tR\x04name\x12\x1c\n" +
"\tnamespace\x18\x02 \x01(\tR\tnamespace\x12=\n" +
- "\x04opts\x18\x03 \x01(\v2).nvidia.nvsentinel.v1alpha1.DeleteOptionsR\x04opts2\xcb\x04\n" +
+ "\x04opts\x18\x03 \x01(\v2).nvidia.nvsentinel.v1alpha1.DeleteOptionsR\x04opts2\xb3\x05\n" +
"\n" +
"GpuService\x12_\n" +
"\x06GetGpu\x12).nvidia.nvsentinel.v1alpha1.GetGpuRequest\x1a*.nvidia.nvsentinel.v1alpha1.GetGpuResponse\x12e\n" +
"\bListGpus\x12+.nvidia.nvsentinel.v1alpha1.ListGpusRequest\x1a,.nvidia.nvsentinel.v1alpha1.ListGpusResponse\x12j\n" +
"\tWatchGpus\x12,.nvidia.nvsentinel.v1alpha1.WatchGpusRequest\x1a-.nvidia.nvsentinel.v1alpha1.WatchGpusResponse0\x01\x12Z\n" +
"\tCreateGpu\x12,.nvidia.nvsentinel.v1alpha1.CreateGpuRequest\x1a\x1f.nvidia.nvsentinel.v1alpha1.Gpu\x12Z\n" +
- "\tUpdateGpu\x12,.nvidia.nvsentinel.v1alpha1.UpdateGpuRequest\x1a\x1f.nvidia.nvsentinel.v1alpha1.Gpu\x12Q\n" +
+ "\tUpdateGpu\x12,.nvidia.nvsentinel.v1alpha1.UpdateGpuRequest\x1a\x1f.nvidia.nvsentinel.v1alpha1.Gpu\x12f\n" +
+ "\x0fUpdateGpuStatus\x122.nvidia.nvsentinel.v1alpha1.UpdateGpuStatusRequest\x1a\x1f.nvidia.nvsentinel.v1alpha1.Gpu\x12Q\n" +
"\tDeleteGpu\x12,.nvidia.nvsentinel.v1alpha1.DeleteGpuRequest\x1a\x16.google.protobuf.EmptyBJZHgithub.com/nvidia/nvsentinel/internal/generated/device/v1alpha1;v1alpha1b\x06proto3"
var (
@@ -1332,41 +1394,42 @@ func file_device_v1alpha1_gpu_proto_rawDescGZIP() []byte {
return file_device_v1alpha1_gpu_proto_rawDescData
}
-var file_device_v1alpha1_gpu_proto_msgTypes = make([]protoimpl.MessageInfo, 21)
+var file_device_v1alpha1_gpu_proto_msgTypes = make([]protoimpl.MessageInfo, 22)
var file_device_v1alpha1_gpu_proto_goTypes = []any{
- (*ObjectMeta)(nil), // 0: nvidia.nvsentinel.v1alpha1.ObjectMeta
- (*ListMeta)(nil), // 1: nvidia.nvsentinel.v1alpha1.ListMeta
- (*GetOptions)(nil), // 2: nvidia.nvsentinel.v1alpha1.GetOptions
- (*ListOptions)(nil), // 3: nvidia.nvsentinel.v1alpha1.ListOptions
- (*CreateOptions)(nil), // 4: nvidia.nvsentinel.v1alpha1.CreateOptions
- (*UpdateOptions)(nil), // 5: nvidia.nvsentinel.v1alpha1.UpdateOptions
- (*DeleteOptions)(nil), // 6: nvidia.nvsentinel.v1alpha1.DeleteOptions
- (*Gpu)(nil), // 7: nvidia.nvsentinel.v1alpha1.Gpu
- (*GpuList)(nil), // 8: nvidia.nvsentinel.v1alpha1.GpuList
- (*GpuSpec)(nil), // 9: nvidia.nvsentinel.v1alpha1.GpuSpec
- (*GpuStatus)(nil), // 10: nvidia.nvsentinel.v1alpha1.GpuStatus
- (*Condition)(nil), // 11: nvidia.nvsentinel.v1alpha1.Condition
- (*GetGpuRequest)(nil), // 12: nvidia.nvsentinel.v1alpha1.GetGpuRequest
- (*GetGpuResponse)(nil), // 13: nvidia.nvsentinel.v1alpha1.GetGpuResponse
- (*ListGpusRequest)(nil), // 14: nvidia.nvsentinel.v1alpha1.ListGpusRequest
- (*ListGpusResponse)(nil), // 15: nvidia.nvsentinel.v1alpha1.ListGpusResponse
- (*WatchGpusRequest)(nil), // 16: nvidia.nvsentinel.v1alpha1.WatchGpusRequest
- (*WatchGpusResponse)(nil), // 17: nvidia.nvsentinel.v1alpha1.WatchGpusResponse
- (*CreateGpuRequest)(nil), // 18: nvidia.nvsentinel.v1alpha1.CreateGpuRequest
- (*UpdateGpuRequest)(nil), // 19: nvidia.nvsentinel.v1alpha1.UpdateGpuRequest
- (*DeleteGpuRequest)(nil), // 20: nvidia.nvsentinel.v1alpha1.DeleteGpuRequest
- (*timestamppb.Timestamp)(nil), // 21: google.protobuf.Timestamp
- (*emptypb.Empty)(nil), // 22: google.protobuf.Empty
+ (*ObjectMeta)(nil), // 0: nvidia.nvsentinel.v1alpha1.ObjectMeta
+ (*ListMeta)(nil), // 1: nvidia.nvsentinel.v1alpha1.ListMeta
+ (*GetOptions)(nil), // 2: nvidia.nvsentinel.v1alpha1.GetOptions
+ (*ListOptions)(nil), // 3: nvidia.nvsentinel.v1alpha1.ListOptions
+ (*CreateOptions)(nil), // 4: nvidia.nvsentinel.v1alpha1.CreateOptions
+ (*UpdateOptions)(nil), // 5: nvidia.nvsentinel.v1alpha1.UpdateOptions
+ (*DeleteOptions)(nil), // 6: nvidia.nvsentinel.v1alpha1.DeleteOptions
+ (*Gpu)(nil), // 7: nvidia.nvsentinel.v1alpha1.Gpu
+ (*GpuList)(nil), // 8: nvidia.nvsentinel.v1alpha1.GpuList
+ (*GpuSpec)(nil), // 9: nvidia.nvsentinel.v1alpha1.GpuSpec
+ (*GpuStatus)(nil), // 10: nvidia.nvsentinel.v1alpha1.GpuStatus
+ (*Condition)(nil), // 11: nvidia.nvsentinel.v1alpha1.Condition
+ (*GetGpuRequest)(nil), // 12: nvidia.nvsentinel.v1alpha1.GetGpuRequest
+ (*GetGpuResponse)(nil), // 13: nvidia.nvsentinel.v1alpha1.GetGpuResponse
+ (*ListGpusRequest)(nil), // 14: nvidia.nvsentinel.v1alpha1.ListGpusRequest
+ (*ListGpusResponse)(nil), // 15: nvidia.nvsentinel.v1alpha1.ListGpusResponse
+ (*WatchGpusRequest)(nil), // 16: nvidia.nvsentinel.v1alpha1.WatchGpusRequest
+ (*WatchGpusResponse)(nil), // 17: nvidia.nvsentinel.v1alpha1.WatchGpusResponse
+ (*CreateGpuRequest)(nil), // 18: nvidia.nvsentinel.v1alpha1.CreateGpuRequest
+ (*UpdateGpuRequest)(nil), // 19: nvidia.nvsentinel.v1alpha1.UpdateGpuRequest
+ (*UpdateGpuStatusRequest)(nil), // 20: nvidia.nvsentinel.v1alpha1.UpdateGpuStatusRequest
+ (*DeleteGpuRequest)(nil), // 21: nvidia.nvsentinel.v1alpha1.DeleteGpuRequest
+ (*timestamppb.Timestamp)(nil), // 22: google.protobuf.Timestamp
+ (*emptypb.Empty)(nil), // 23: google.protobuf.Empty
}
var file_device_v1alpha1_gpu_proto_depIdxs = []int32{
- 21, // 0: nvidia.nvsentinel.v1alpha1.ObjectMeta.creation_timestamp:type_name -> google.protobuf.Timestamp
+ 22, // 0: nvidia.nvsentinel.v1alpha1.ObjectMeta.creation_timestamp:type_name -> google.protobuf.Timestamp
0, // 1: nvidia.nvsentinel.v1alpha1.Gpu.metadata:type_name -> nvidia.nvsentinel.v1alpha1.ObjectMeta
9, // 2: nvidia.nvsentinel.v1alpha1.Gpu.spec:type_name -> nvidia.nvsentinel.v1alpha1.GpuSpec
10, // 3: nvidia.nvsentinel.v1alpha1.Gpu.status:type_name -> nvidia.nvsentinel.v1alpha1.GpuStatus
1, // 4: nvidia.nvsentinel.v1alpha1.GpuList.metadata:type_name -> nvidia.nvsentinel.v1alpha1.ListMeta
7, // 5: nvidia.nvsentinel.v1alpha1.GpuList.items:type_name -> nvidia.nvsentinel.v1alpha1.Gpu
11, // 6: nvidia.nvsentinel.v1alpha1.GpuStatus.conditions:type_name -> nvidia.nvsentinel.v1alpha1.Condition
- 21, // 7: nvidia.nvsentinel.v1alpha1.Condition.last_transition_time:type_name -> google.protobuf.Timestamp
+ 22, // 7: nvidia.nvsentinel.v1alpha1.Condition.last_transition_time:type_name -> google.protobuf.Timestamp
2, // 8: nvidia.nvsentinel.v1alpha1.GetGpuRequest.opts:type_name -> nvidia.nvsentinel.v1alpha1.GetOptions
7, // 9: nvidia.nvsentinel.v1alpha1.GetGpuResponse.gpu:type_name -> nvidia.nvsentinel.v1alpha1.Gpu
3, // 10: nvidia.nvsentinel.v1alpha1.ListGpusRequest.opts:type_name -> nvidia.nvsentinel.v1alpha1.ListOptions
@@ -1377,24 +1440,28 @@ var file_device_v1alpha1_gpu_proto_depIdxs = []int32{
4, // 15: nvidia.nvsentinel.v1alpha1.CreateGpuRequest.opts:type_name -> nvidia.nvsentinel.v1alpha1.CreateOptions
7, // 16: nvidia.nvsentinel.v1alpha1.UpdateGpuRequest.gpu:type_name -> nvidia.nvsentinel.v1alpha1.Gpu
5, // 17: nvidia.nvsentinel.v1alpha1.UpdateGpuRequest.opts:type_name -> nvidia.nvsentinel.v1alpha1.UpdateOptions
- 6, // 18: nvidia.nvsentinel.v1alpha1.DeleteGpuRequest.opts:type_name -> nvidia.nvsentinel.v1alpha1.DeleteOptions
- 12, // 19: nvidia.nvsentinel.v1alpha1.GpuService.GetGpu:input_type -> nvidia.nvsentinel.v1alpha1.GetGpuRequest
- 14, // 20: nvidia.nvsentinel.v1alpha1.GpuService.ListGpus:input_type -> nvidia.nvsentinel.v1alpha1.ListGpusRequest
- 16, // 21: nvidia.nvsentinel.v1alpha1.GpuService.WatchGpus:input_type -> nvidia.nvsentinel.v1alpha1.WatchGpusRequest
- 18, // 22: nvidia.nvsentinel.v1alpha1.GpuService.CreateGpu:input_type -> nvidia.nvsentinel.v1alpha1.CreateGpuRequest
- 19, // 23: nvidia.nvsentinel.v1alpha1.GpuService.UpdateGpu:input_type -> nvidia.nvsentinel.v1alpha1.UpdateGpuRequest
- 20, // 24: nvidia.nvsentinel.v1alpha1.GpuService.DeleteGpu:input_type -> nvidia.nvsentinel.v1alpha1.DeleteGpuRequest
- 13, // 25: nvidia.nvsentinel.v1alpha1.GpuService.GetGpu:output_type -> nvidia.nvsentinel.v1alpha1.GetGpuResponse
- 15, // 26: nvidia.nvsentinel.v1alpha1.GpuService.ListGpus:output_type -> nvidia.nvsentinel.v1alpha1.ListGpusResponse
- 17, // 27: nvidia.nvsentinel.v1alpha1.GpuService.WatchGpus:output_type -> nvidia.nvsentinel.v1alpha1.WatchGpusResponse
- 7, // 28: nvidia.nvsentinel.v1alpha1.GpuService.CreateGpu:output_type -> nvidia.nvsentinel.v1alpha1.Gpu
- 7, // 29: nvidia.nvsentinel.v1alpha1.GpuService.UpdateGpu:output_type -> nvidia.nvsentinel.v1alpha1.Gpu
- 22, // 30: nvidia.nvsentinel.v1alpha1.GpuService.DeleteGpu:output_type -> google.protobuf.Empty
- 25, // [25:31] is the sub-list for method output_type
- 19, // [19:25] is the sub-list for method input_type
- 19, // [19:19] is the sub-list for extension type_name
- 19, // [19:19] is the sub-list for extension extendee
- 0, // [0:19] is the sub-list for field type_name
+ 7, // 18: nvidia.nvsentinel.v1alpha1.UpdateGpuStatusRequest.gpu:type_name -> nvidia.nvsentinel.v1alpha1.Gpu
+ 5, // 19: nvidia.nvsentinel.v1alpha1.UpdateGpuStatusRequest.opts:type_name -> nvidia.nvsentinel.v1alpha1.UpdateOptions
+ 6, // 20: nvidia.nvsentinel.v1alpha1.DeleteGpuRequest.opts:type_name -> nvidia.nvsentinel.v1alpha1.DeleteOptions
+ 12, // 21: nvidia.nvsentinel.v1alpha1.GpuService.GetGpu:input_type -> nvidia.nvsentinel.v1alpha1.GetGpuRequest
+ 14, // 22: nvidia.nvsentinel.v1alpha1.GpuService.ListGpus:input_type -> nvidia.nvsentinel.v1alpha1.ListGpusRequest
+ 16, // 23: nvidia.nvsentinel.v1alpha1.GpuService.WatchGpus:input_type -> nvidia.nvsentinel.v1alpha1.WatchGpusRequest
+ 18, // 24: nvidia.nvsentinel.v1alpha1.GpuService.CreateGpu:input_type -> nvidia.nvsentinel.v1alpha1.CreateGpuRequest
+ 19, // 25: nvidia.nvsentinel.v1alpha1.GpuService.UpdateGpu:input_type -> nvidia.nvsentinel.v1alpha1.UpdateGpuRequest
+ 20, // 26: nvidia.nvsentinel.v1alpha1.GpuService.UpdateGpuStatus:input_type -> nvidia.nvsentinel.v1alpha1.UpdateGpuStatusRequest
+ 21, // 27: nvidia.nvsentinel.v1alpha1.GpuService.DeleteGpu:input_type -> nvidia.nvsentinel.v1alpha1.DeleteGpuRequest
+ 13, // 28: nvidia.nvsentinel.v1alpha1.GpuService.GetGpu:output_type -> nvidia.nvsentinel.v1alpha1.GetGpuResponse
+ 15, // 29: nvidia.nvsentinel.v1alpha1.GpuService.ListGpus:output_type -> nvidia.nvsentinel.v1alpha1.ListGpusResponse
+ 17, // 30: nvidia.nvsentinel.v1alpha1.GpuService.WatchGpus:output_type -> nvidia.nvsentinel.v1alpha1.WatchGpusResponse
+ 7, // 31: nvidia.nvsentinel.v1alpha1.GpuService.CreateGpu:output_type -> nvidia.nvsentinel.v1alpha1.Gpu
+ 7, // 32: nvidia.nvsentinel.v1alpha1.GpuService.UpdateGpu:output_type -> nvidia.nvsentinel.v1alpha1.Gpu
+ 7, // 33: nvidia.nvsentinel.v1alpha1.GpuService.UpdateGpuStatus:output_type -> nvidia.nvsentinel.v1alpha1.Gpu
+ 23, // 34: nvidia.nvsentinel.v1alpha1.GpuService.DeleteGpu:output_type -> google.protobuf.Empty
+ 28, // [28:35] is the sub-list for method output_type
+ 21, // [21:28] is the sub-list for method input_type
+ 21, // [21:21] is the sub-list for extension type_name
+ 21, // [21:21] is the sub-list for extension extendee
+ 0, // [0:21] is the sub-list for field type_name
}
func init() { file_device_v1alpha1_gpu_proto_init() }
@@ -1408,7 +1475,7 @@ func file_device_v1alpha1_gpu_proto_init() {
GoPackagePath: reflect.TypeOf(x{}).PkgPath(),
RawDescriptor: unsafe.Slice(unsafe.StringData(file_device_v1alpha1_gpu_proto_rawDesc), len(file_device_v1alpha1_gpu_proto_rawDesc)),
NumEnums: 0,
- NumMessages: 21,
+ NumMessages: 22,
NumExtensions: 0,
NumServices: 1,
},
diff --git a/internal/generated/device/v1alpha1/gpu_grpc.pb.go b/internal/generated/device/v1alpha1/gpu_grpc.pb.go
index c31f32a56..2590d7ca7 100644
--- a/internal/generated/device/v1alpha1/gpu_grpc.pb.go
+++ b/internal/generated/device/v1alpha1/gpu_grpc.pb.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
// Code generated by protoc-gen-go-grpc. DO NOT EDIT.
// versions:
// - protoc-gen-go-grpc v1.5.1
-// - protoc v6.33.0
+// - protoc v6.33.4
// source: device/v1alpha1/gpu.proto
package v1alpha1
@@ -34,12 +34,13 @@ import (
const _ = grpc.SupportPackageIsVersion9
const (
- GpuService_GetGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/GetGpu"
- GpuService_ListGpus_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/ListGpus"
- GpuService_WatchGpus_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/WatchGpus"
- GpuService_CreateGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/CreateGpu"
- GpuService_UpdateGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/UpdateGpu"
- GpuService_DeleteGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/DeleteGpu"
+ GpuService_GetGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/GetGpu"
+ GpuService_ListGpus_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/ListGpus"
+ GpuService_WatchGpus_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/WatchGpus"
+ GpuService_CreateGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/CreateGpu"
+ GpuService_UpdateGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/UpdateGpu"
+ GpuService_UpdateGpuStatus_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/UpdateGpuStatus"
+ GpuService_DeleteGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/DeleteGpu"
)
// GpuServiceClient is the client API for GpuService service.
@@ -58,6 +59,8 @@ type GpuServiceClient interface {
CreateGpu(ctx context.Context, in *CreateGpuRequest, opts ...grpc.CallOption) (*Gpu, error)
// UpdateGpu updates a single GPU resource.
UpdateGpu(ctx context.Context, in *UpdateGpuRequest, opts ...grpc.CallOption) (*Gpu, error)
+ // UpdateGpuStatus updates only the status subresource of a GPU.
+ UpdateGpuStatus(ctx context.Context, in *UpdateGpuStatusRequest, opts ...grpc.CallOption) (*Gpu, error)
// DeleteGpu deletes a single GPU resource.
DeleteGpu(ctx context.Context, in *DeleteGpuRequest, opts ...grpc.CallOption) (*emptypb.Empty, error)
}
@@ -129,6 +132,16 @@ func (c *gpuServiceClient) UpdateGpu(ctx context.Context, in *UpdateGpuRequest,
return out, nil
}
+func (c *gpuServiceClient) UpdateGpuStatus(ctx context.Context, in *UpdateGpuStatusRequest, opts ...grpc.CallOption) (*Gpu, error) {
+ cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
+ out := new(Gpu)
+ err := c.cc.Invoke(ctx, GpuService_UpdateGpuStatus_FullMethodName, in, out, cOpts...)
+ if err != nil {
+ return nil, err
+ }
+ return out, nil
+}
+
func (c *gpuServiceClient) DeleteGpu(ctx context.Context, in *DeleteGpuRequest, opts ...grpc.CallOption) (*emptypb.Empty, error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
out := new(emptypb.Empty)
@@ -155,6 +168,8 @@ type GpuServiceServer interface {
CreateGpu(context.Context, *CreateGpuRequest) (*Gpu, error)
// UpdateGpu updates a single GPU resource.
UpdateGpu(context.Context, *UpdateGpuRequest) (*Gpu, error)
+ // UpdateGpuStatus updates only the status subresource of a GPU.
+ UpdateGpuStatus(context.Context, *UpdateGpuStatusRequest) (*Gpu, error)
// DeleteGpu deletes a single GPU resource.
DeleteGpu(context.Context, *DeleteGpuRequest) (*emptypb.Empty, error)
mustEmbedUnimplementedGpuServiceServer()
@@ -182,6 +197,9 @@ func (UnimplementedGpuServiceServer) CreateGpu(context.Context, *CreateGpuReques
func (UnimplementedGpuServiceServer) UpdateGpu(context.Context, *UpdateGpuRequest) (*Gpu, error) {
return nil, status.Errorf(codes.Unimplemented, "method UpdateGpu not implemented")
}
+func (UnimplementedGpuServiceServer) UpdateGpuStatus(context.Context, *UpdateGpuStatusRequest) (*Gpu, error) {
+ return nil, status.Errorf(codes.Unimplemented, "method UpdateGpuStatus not implemented")
+}
func (UnimplementedGpuServiceServer) DeleteGpu(context.Context, *DeleteGpuRequest) (*emptypb.Empty, error) {
return nil, status.Errorf(codes.Unimplemented, "method DeleteGpu not implemented")
}
@@ -289,6 +307,24 @@ func _GpuService_UpdateGpu_Handler(srv interface{}, ctx context.Context, dec fun
return interceptor(ctx, in, info, handler)
}
+func _GpuService_UpdateGpuStatus_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+ in := new(UpdateGpuStatusRequest)
+ if err := dec(in); err != nil {
+ return nil, err
+ }
+ if interceptor == nil {
+ return srv.(GpuServiceServer).UpdateGpuStatus(ctx, in)
+ }
+ info := &grpc.UnaryServerInfo{
+ Server: srv,
+ FullMethod: GpuService_UpdateGpuStatus_FullMethodName,
+ }
+ handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+ return srv.(GpuServiceServer).UpdateGpuStatus(ctx, req.(*UpdateGpuStatusRequest))
+ }
+ return interceptor(ctx, in, info, handler)
+}
+
func _GpuService_DeleteGpu_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
in := new(DeleteGpuRequest)
if err := dec(in); err != nil {
@@ -330,6 +366,10 @@ var GpuService_ServiceDesc = grpc.ServiceDesc{
MethodName: "UpdateGpu",
Handler: _GpuService_UpdateGpu_Handler,
},
+ {
+ MethodName: "UpdateGpuStatus",
+ Handler: _GpuService_UpdateGpuStatus_Handler,
+ },
{
MethodName: "DeleteGpu",
Handler: _GpuService_DeleteGpu_Handler,
diff --git a/pkg/client-go/client/versioned/clientset.go b/pkg/client-go/client/versioned/clientset.go
index 0779de3d7..6a7505817 100644
--- a/pkg/client-go/client/versioned/clientset.go
+++ b/pkg/client-go/client/versioned/clientset.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/client/versioned/fake/clientset_generated.go b/pkg/client-go/client/versioned/fake/clientset_generated.go
index e0118c0f8..71cea6c1b 100644
--- a/pkg/client-go/client/versioned/fake/clientset_generated.go
+++ b/pkg/client-go/client/versioned/fake/clientset_generated.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/client/versioned/fake/doc.go b/pkg/client-go/client/versioned/fake/doc.go
index 44b048c89..f6c7d06f0 100644
--- a/pkg/client-go/client/versioned/fake/doc.go
+++ b/pkg/client-go/client/versioned/fake/doc.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/client/versioned/fake/register.go b/pkg/client-go/client/versioned/fake/register.go
index 1573cb4f7..a2d9f7802 100644
--- a/pkg/client-go/client/versioned/fake/register.go
+++ b/pkg/client-go/client/versioned/fake/register.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/client/versioned/scheme/doc.go b/pkg/client-go/client/versioned/scheme/doc.go
index 55f52dc51..a3f9c58bc 100644
--- a/pkg/client-go/client/versioned/scheme/doc.go
+++ b/pkg/client-go/client/versioned/scheme/doc.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/client/versioned/scheme/register.go b/pkg/client-go/client/versioned/scheme/register.go
index 97cf5a8ff..46045b406 100644
--- a/pkg/client-go/client/versioned/scheme/register.go
+++ b/pkg/client-go/client/versioned/scheme/register.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/client/versioned/typed/device/v1alpha1/device_client.go b/pkg/client-go/client/versioned/typed/device/v1alpha1/device_client.go
index 01b63b877..d2ca86aa3 100644
--- a/pkg/client-go/client/versioned/typed/device/v1alpha1/device_client.go
+++ b/pkg/client-go/client/versioned/typed/device/v1alpha1/device_client.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/client/versioned/typed/device/v1alpha1/doc.go b/pkg/client-go/client/versioned/typed/device/v1alpha1/doc.go
index 7749c1800..c689ab840 100644
--- a/pkg/client-go/client/versioned/typed/device/v1alpha1/doc.go
+++ b/pkg/client-go/client/versioned/typed/device/v1alpha1/doc.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/doc.go b/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/doc.go
index 2702a5453..942a10f72 100644
--- a/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/doc.go
+++ b/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/doc.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_device_client.go b/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_device_client.go
index 32c7c5401..5bd437c2c 100644
--- a/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_device_client.go
+++ b/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_device_client.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_gpu.go b/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_gpu.go
index 192da6fa1..e68564670 100644
--- a/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_gpu.go
+++ b/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_gpu.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -110,6 +110,15 @@ func (c *fakeGPUs) Update(ctx context.Context, gPU *devicev1alpha1.GPU, opts v1.
return obj.(*devicev1alpha1.GPU), err
}
+func (c *fakeGPUs) UpdateStatus(ctx context.Context, gPU *devicev1alpha1.GPU, opts v1.UpdateOptions) (*devicev1alpha1.GPU, error) {
+ obj, err := c.Fake.
+ Invokes(testing.NewRootUpdateSubresourceActionWithOptions(c.Resource(), "status", gPU, opts), &devicev1alpha1.GPU{})
+ if obj == nil {
+ return nil, err
+ }
+ return obj.(*devicev1alpha1.GPU), err
+}
+
// Delete takes name of the gPU and deletes it. Returns an error if one occurs.
func (c *fakeGPUs) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error {
_, err := c.Fake.
diff --git a/pkg/client-go/client/versioned/typed/device/v1alpha1/generated_expansion.go b/pkg/client-go/client/versioned/typed/device/v1alpha1/generated_expansion.go
index c99bbb48c..97d724146 100644
--- a/pkg/client-go/client/versioned/typed/device/v1alpha1/generated_expansion.go
+++ b/pkg/client-go/client/versioned/typed/device/v1alpha1/generated_expansion.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/client/versioned/typed/device/v1alpha1/gpu.go b/pkg/client-go/client/versioned/typed/device/v1alpha1/gpu.go
index 4328d58a5..734754200 100644
--- a/pkg/client-go/client/versioned/typed/device/v1alpha1/gpu.go
+++ b/pkg/client-go/client/versioned/typed/device/v1alpha1/gpu.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -38,6 +38,7 @@ type GPUsGetter interface {
type GPUInterface interface {
Create(ctx context.Context, gPU *devicev1alpha1.GPU, opts v1.CreateOptions) (*devicev1alpha1.GPU, error)
Update(ctx context.Context, gPU *devicev1alpha1.GPU, opts v1.UpdateOptions) (*devicev1alpha1.GPU, error)
+ UpdateStatus(ctx context.Context, gPU *devicev1alpha1.GPU, opts v1.UpdateOptions) (*devicev1alpha1.GPU, error)
Delete(ctx context.Context, name string, opts v1.DeleteOptions) error
Get(ctx context.Context, name string, opts v1.GetOptions) (*devicev1alpha1.GPU, error)
List(ctx context.Context, opts v1.ListOptions) (*devicev1alpha1.GPUList, error)
@@ -191,6 +192,26 @@ func (c *gpus) Update(ctx context.Context, gpu *devicev1alpha1.GPU, opts v1.Upda
return obj, nil
}
+// UpdateStatus updates only the status subresource of a GPU.
+func (c *gpus) UpdateStatus(ctx context.Context, gpu *devicev1alpha1.GPU, opts v1.UpdateOptions) (*devicev1alpha1.GPU, error) {
+ resp, err := c.client.UpdateGpuStatus(ctx, &pb.UpdateGpuStatusRequest{
+ Gpu: devicev1alpha1.ToProto(gpu),
+ Opts: &pb.UpdateOptions{},
+ })
+ if err != nil {
+ return nil, err
+ }
+
+ obj := devicev1alpha1.FromProto(resp)
+ c.logger.V(2).Info("Updated GPU status",
+ "name", obj.GetName(),
+ "namespace", c.getNamespace(),
+ "resource-version", obj.GetResourceVersion(),
+ )
+
+ return obj, nil
+}
+
// TODO: Implement DeleteOptions support.
func (c *gpus) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error {
_, err := c.client.DeleteGpu(ctx, &pb.DeleteGpuRequest{
diff --git a/pkg/client-go/informers/externalversions/device/interface.go b/pkg/client-go/informers/externalversions/device/interface.go
index 871a7d07f..702c09212 100644
--- a/pkg/client-go/informers/externalversions/device/interface.go
+++ b/pkg/client-go/informers/externalversions/device/interface.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/informers/externalversions/device/v1alpha1/gpu.go b/pkg/client-go/informers/externalversions/device/v1alpha1/gpu.go
index db5da81ac..b5f6f419f 100644
--- a/pkg/client-go/informers/externalversions/device/v1alpha1/gpu.go
+++ b/pkg/client-go/informers/externalversions/device/v1alpha1/gpu.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/informers/externalversions/device/v1alpha1/interface.go b/pkg/client-go/informers/externalversions/device/v1alpha1/interface.go
index f3921c8e3..68303b6eb 100644
--- a/pkg/client-go/informers/externalversions/device/v1alpha1/interface.go
+++ b/pkg/client-go/informers/externalversions/device/v1alpha1/interface.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/informers/externalversions/factory.go b/pkg/client-go/informers/externalversions/factory.go
index 296c50425..cbf2ef267 100644
--- a/pkg/client-go/informers/externalversions/factory.go
+++ b/pkg/client-go/informers/externalversions/factory.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/informers/externalversions/generic.go b/pkg/client-go/informers/externalversions/generic.go
index f8ccccacc..0382aab5b 100644
--- a/pkg/client-go/informers/externalversions/generic.go
+++ b/pkg/client-go/informers/externalversions/generic.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go b/pkg/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go
index 35543b30e..f63107c96 100644
--- a/pkg/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go
+++ b/pkg/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/listers/device/v1alpha1/expansion_generated.go b/pkg/client-go/listers/device/v1alpha1/expansion_generated.go
index 1aa65cee4..011529aa5 100644
--- a/pkg/client-go/listers/device/v1alpha1/expansion_generated.go
+++ b/pkg/client-go/listers/device/v1alpha1/expansion_generated.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/client-go/listers/device/v1alpha1/gpu.go b/pkg/client-go/listers/device/v1alpha1/gpu.go
index 709bd429f..2ea778590 100644
--- a/pkg/client-go/listers/device/v1alpha1/gpu.go
+++ b/pkg/client-go/listers/device/v1alpha1/gpu.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
diff --git a/pkg/controlplane/apiserver/config.go b/pkg/controlplane/apiserver/config.go
index bb3d8bff7..0fa090d3b 100644
--- a/pkg/controlplane/apiserver/config.go
+++ b/pkg/controlplane/apiserver/config.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -30,7 +30,7 @@ import (
"github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/metrics"
"github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/options"
"github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/registry"
- "github.com/nvidia/nvsentinel/pkg/util/version"
+ "github.com/nvidia/nvsentinel/pkg/version"
)
type Config struct {
diff --git a/pkg/controlplane/apiserver/metrics/metrics.go b/pkg/controlplane/apiserver/metrics/metrics.go
index 98056ec81..2618ebebc 100644
--- a/pkg/controlplane/apiserver/metrics/metrics.go
+++ b/pkg/controlplane/apiserver/metrics/metrics.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@ import (
"sync"
grpcprom "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus"
- "github.com/nvidia/nvsentinel/pkg/util/version"
+ "github.com/nvidia/nvsentinel/pkg/version"
"github.com/prometheus/client_golang/prometheus"
"google.golang.org/grpc"
"k8s.io/klog/v2"
@@ -31,14 +31,18 @@ type ServerMetrics struct {
Registry *prometheus.Registry
Collectors *grpcprom.ServerMetrics
ServiceHealthStatus *prometheus.GaugeVec
+ mu sync.Mutex
buildInfoLabels prometheus.Labels
registerOnce sync.Once
}
// WithBuildInfo populates the metadata labels used by the build_info metric.
+// Must be called before Register() and only from a single goroutine (typically during init).
func (m *ServerMetrics) WithBuildInfo(info version.Info) *ServerMetrics {
+ m.mu.Lock()
+ defer m.mu.Unlock()
m.buildInfoLabels = prometheus.Labels{
- "version": info.GitVersion,
+ "version": info.Version,
"revision": info.GitCommit,
"build_date": info.BuildDate,
"goversion": info.GoVersion,
@@ -79,11 +83,15 @@ func (m *ServerMetrics) Register() {
klog.ErrorS(err, "Failed to register service health metrics")
}
- if m.buildInfoLabels != nil {
+ m.mu.Lock()
+ labels := m.buildInfoLabels
+ m.mu.Unlock()
+
+ if labels != nil {
version := prometheus.NewGauge(prometheus.GaugeOpts{
Name: "device_apiserver_build_info",
Help: "Build information about the device-apiserver binary.",
- ConstLabels: m.buildInfoLabels,
+ ConstLabels: labels,
})
version.Set(1)
diff --git a/pkg/controlplane/apiserver/options/grpc/options.go b/pkg/controlplane/apiserver/options/grpc/options.go
index 238700c8b..ff46b4728 100644
--- a/pkg/controlplane/apiserver/options/grpc/options.go
+++ b/pkg/controlplane/apiserver/options/grpc/options.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -125,8 +125,6 @@ func (o *Options) Complete() (CompletedOptions, error) {
o.MinPingInterval = 5 * time.Second
}
- o.PermitWithoutStream = true
-
completed := completedOptions{
Options: *o,
}
@@ -197,12 +195,6 @@ func (o *Options) Validate() []error {
o.MinPingInterval))
}
- if !o.PermitWithoutStream {
- allErrors = append(allErrors,
- fmt.Errorf("permit-without-stream: %v must be true to allow keepalive pings without active streams",
- o.PermitWithoutStream))
- }
-
return allErrors
}
diff --git a/pkg/controlplane/apiserver/options/grpc/options_test.go b/pkg/controlplane/apiserver/options/grpc/options_test.go
index eb725f423..f39e52f4c 100644
--- a/pkg/controlplane/apiserver/options/grpc/options_test.go
+++ b/pkg/controlplane/apiserver/options/grpc/options_test.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -75,9 +75,8 @@ func TestComplete(t *testing.T) {
if completed.MaxRecvMsgSize != 4194304 {
t.Errorf("expected default recv size 4MiB, got %d", completed.MaxRecvMsgSize)
}
- if !completed.PermitWithoutStream {
- t.Error("PermitWithoutStream should be forced to true")
- }
+ // PermitWithoutStream defaults to true via NewOptions(), not forced by Complete().
+ // A zero-value Options{} will have PermitWithoutStream=false since there is no flag for it.
})
t.Run("Preserve user overrides", func(t *testing.T) {
diff --git a/pkg/controlplane/apiserver/options/options.go b/pkg/controlplane/apiserver/options/options.go
index 113523ef5..c6b5b9470 100644
--- a/pkg/controlplane/apiserver/options/options.go
+++ b/pkg/controlplane/apiserver/options/options.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@ package options
import (
"context"
"fmt"
+ "net"
"os"
"strings"
"time"
@@ -116,6 +117,8 @@ func (o *Options) Complete(ctx context.Context) (CompletedOptions, error) {
o.NodeName = strings.ToLower(strings.TrimSpace(o.NodeName)) //nolint:wsl
if o.HealthAddress == "" {
+ // Default binds to all interfaces for Kubernetes kubelet health probes.
+ // Use NetworkPolicy to restrict access in production.
o.HealthAddress = ":50051"
}
@@ -124,6 +127,8 @@ func (o *Options) Complete(ctx context.Context) (CompletedOptions, error) {
}
if o.MetricsAddress == "" {
+ // Default binds to all interfaces for Prometheus scraping.
+ // Use NetworkPolicy to restrict access in production.
o.MetricsAddress = ":9090"
}
@@ -203,10 +208,15 @@ func (o *CompletedOptions) Validate() []error {
}
}
- if o.HealthAddress != "" && o.HealthAddress == o.MetricsAddress {
- allErrors = append(allErrors,
- fmt.Errorf("health-probe-bind-address and metrics-bind-address: must not be the same (%s)",
- o.HealthAddress))
+ if o.HealthAddress != "" && o.MetricsAddress != "" {
+ _, healthPort, _ := net.SplitHostPort(o.HealthAddress)
+ _, metricsPort, _ := net.SplitHostPort(o.MetricsAddress)
+
+ if healthPort != "" && healthPort == metricsPort {
+ allErrors = append(allErrors,
+ fmt.Errorf("health-probe-bind-address and metrics-bind-address: must not use the same port (%s)",
+ healthPort))
+ }
}
if o.ShutdownGracePeriod < 0 {
diff --git a/pkg/controlplane/apiserver/options/options_test.go b/pkg/controlplane/apiserver/options/options_test.go
index 0f905a6bc..1108e9f7e 100644
--- a/pkg/controlplane/apiserver/options/options_test.go
+++ b/pkg/controlplane/apiserver/options/options_test.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -158,7 +158,7 @@ func TestValidate(t *testing.T) {
o.MetricsAddress = ":8080"
},
wantErr: true,
- errContains: "must not be the same (:8080)",
+ errContains: "must not use the same port (8080)",
},
{
name: "Negative service monitor period",
diff --git a/pkg/controlplane/apiserver/server.go b/pkg/controlplane/apiserver/server.go
index a7a616aa9..41ecdb00a 100644
--- a/pkg/controlplane/apiserver/server.go
+++ b/pkg/controlplane/apiserver/server.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@ import (
"github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/metrics"
"github.com/nvidia/nvsentinel/pkg/storage/storagebackend"
netutils "github.com/nvidia/nvsentinel/pkg/util/net"
- "github.com/nvidia/nvsentinel/pkg/util/version"
+ "github.com/nvidia/nvsentinel/pkg/version"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"google.golang.org/grpc"
@@ -36,6 +36,7 @@ import (
"google.golang.org/grpc/health"
healthpb "google.golang.org/grpc/health/grpc_health_v1"
"google.golang.org/grpc/reflection"
+ "k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
)
@@ -51,7 +52,6 @@ type DeviceAPIServer struct {
AdminServer *grpc.Server
AdminCleanup func()
Metrics *metrics.ServerMetrics
- MetricsRegistry *prometheus.Registry
Storage *storagebackend.Storage
ServiceProviders []api.ServiceProvider
mu sync.RWMutex
@@ -92,9 +92,16 @@ func (s *DeviceAPIServer) PrepareRun(ctx context.Context) (preparedDeviceAPIServ
if s.HealthAddress != "" {
s.HealthServer = health.NewServer()
healthpb.RegisterHealthServer(s.AdminServer, s.HealthServer)
+ // Also register on DeviceServer so sidecar providers connecting via
+ // unix socket can perform health checks without a separate connection.
+ healthpb.RegisterHealthServer(s.DeviceServer, s.HealthServer)
s.HealthServer.SetServingStatus("", healthpb.HealthCheckResponse_NOT_SERVING)
}
+ // Enable gRPC reflection on both servers. This is intentional:
+ // - DeviceServer: allows grpcurl/grpc_cli debugging
+ // - AdminServer: required for channelz and admin tooling
+ // To restrict in production, use NetworkPolicy on the admin port.
reflection.Register(s.DeviceServer)
reflection.Register(s.AdminServer)
@@ -139,13 +146,27 @@ func (s *DeviceAPIServer) run(ctx context.Context) error {
go func() {
defer s.wg.Done()
+ defer func() {
+ if r := recover(); r != nil {
+ klog.ErrorS(nil, "Health monitor panicked, setting NOT_SERVING", "panic", r)
+
+ if s.HealthServer != nil {
+ s.HealthServer.SetServingStatus("", healthpb.HealthCheckResponse_NOT_SERVING)
+ }
+ }
+ }()
+
s.monitorServiceHealth(ctx)
}()
}
if s.MetricsAddress != "" {
- // TODO: put in wg??
- go s.serveMetrics(ctx)
+ s.wg.Add(1)
+ go func() {
+ defer s.wg.Done()
+
+ s.serveMetrics(ctx)
+ }()
}
if err := s.waitForStorage(ctx); err != nil {
@@ -174,7 +195,18 @@ func (s *DeviceAPIServer) run(ctx context.Context) error {
s.DeviceServer.GracefulStop()
if s.AdminServer != nil {
- s.AdminServer.GracefulStop()
+ adminDone := make(chan struct{})
+ go func() {
+ s.AdminServer.GracefulStop()
+ close(adminDone)
+ }()
+
+ select {
+ case <-adminDone:
+ case <-time.After(s.ShutdownGracePeriod):
+ logger.V(2).Info("AdminServer graceful stop timed out, forcing stop")
+ s.AdminServer.Stop()
+ }
}
if s.AdminCleanup != nil {
@@ -214,14 +246,17 @@ func (s *DeviceAPIServer) serveHealth(ctx context.Context) {
// to unblock Serve and reject new conns.
go func() {
<-ctx.Done()
- lis.Close()
+
+ if err := lis.Close(); err != nil {
+ logger.Error(err, "Failed to close health listener", "address", s.HealthAddress)
+ }
}()
logger.V(2).Info("Starting health server", "address", s.HealthAddress)
serveErr := s.AdminServer.Serve(lis)
if serveErr != nil && !errors.Is(serveErr, grpc.ErrServerStopped) && !errors.Is(serveErr, net.ErrClosed) {
- logger.Error(err, "Health server stopped unexpectedly")
+ logger.Error(serveErr, "Health server stopped unexpectedly")
}
}
@@ -268,7 +303,7 @@ func (s *DeviceAPIServer) serveMetrics(ctx context.Context) {
serveErr := metricsSrv.Serve(lis)
if serveErr != nil && !errors.Is(serveErr, http.ErrServerClosed) && !errors.Is(serveErr, net.ErrClosed) {
- logger.Error(err, "Metrics server stopped unexpectedly", "address", s.MetricsAddress)
+ logger.Error(serveErr, "Metrics server stopped unexpectedly", "address", s.MetricsAddress)
}
}
@@ -277,48 +312,40 @@ func (s *DeviceAPIServer) waitForStorage(ctx context.Context) error {
return fmt.Errorf("storage backend is not initialized")
}
- logger := klog.FromContext(ctx)
- startTime := time.Now()
-
if s.Storage.IsReady() {
return nil
}
- pollTicker := time.NewTicker(200 * time.Millisecond)
- defer pollTicker.Stop()
-
- heartbeat := time.NewTicker(5 * time.Second)
- defer heartbeat.Stop()
-
+ logger := klog.FromContext(ctx)
logger.Info("Waiting for storage backend to become ready")
+ startTime := time.Now()
- for {
- select {
- case <-ctx.Done():
- return ctx.Err()
-
- case <-pollTicker.C:
+ err := wait.PollUntilContextTimeout(ctx, 200*time.Millisecond, 60*time.Second, true,
+ func(ctx context.Context) (bool, error) {
if s.Storage.IsReady() {
logger.V(2).Info("Storage backend is ready",
"duration", time.Since(startTime).Round(time.Second))
- return nil
+ return true, nil
}
- case <-heartbeat.C:
- logger.V(2).Info("Still waiting for storage backend",
- "elapsed", time.Since(startTime).Round(time.Second))
- }
+ return false, nil
+ },
+ )
+ if err != nil {
+ return fmt.Errorf("timed out waiting for storage backend readiness: %w", err)
}
+
+ return nil
}
func (s *DeviceAPIServer) installAPIServices(ctx context.Context) error {
logger := klog.FromContext(ctx)
var services []api.Service
- for _, sp := range s.ServiceProviders {
+ for i, sp := range s.ServiceProviders {
service, err := sp.Install(s.DeviceServer, s.Storage.StorageConfig)
if err != nil {
- return fmt.Errorf("failed to install API service: %w", err)
+ return fmt.Errorf("failed to install API service (index %d): %w", i, err)
}
services = append(services, service)
diff --git a/pkg/grpc/client/client_conn.go b/pkg/grpc/client/client_conn.go
index 1563e6d6a..5a19b3810 100644
--- a/pkg/grpc/client/client_conn.go
+++ b/pkg/grpc/client/client_conn.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@ package client
import (
"fmt"
+ "strings"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
@@ -39,6 +40,15 @@ func ClientConnFor(config *Config, opts ...DialOption) (*grpc.ClientConn, error)
return nil, err
}
+ // Insecure credentials are only safe over Unix domain sockets.
+ // TLS is required for non-UDS targets (dns:, passthrough:).
+ if !strings.HasPrefix(cfg.Target, "unix://") && !strings.HasPrefix(cfg.Target, "unix:") {
+ return nil, fmt.Errorf(
+ "insecure credentials require unix:// target, got %q; TLS is required for non-UDS targets",
+ cfg.Target,
+ )
+ }
+
logger := cfg.GetLogger()
grpcOpts := []grpc.DialOption{
diff --git a/pkg/grpc/client/client_conn_test.go b/pkg/grpc/client/client_conn_test.go
index 18f9d1864..b8589fad4 100644
--- a/pkg/grpc/client/client_conn_test.go
+++ b/pkg/grpc/client/client_conn_test.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
package client
import (
+ "strings"
"testing"
"github.com/go-logr/logr"
@@ -54,4 +55,18 @@ func TestClientConnFor(t *testing.T) {
}
conn.Close()
})
+
+ t.Run("Rejects non-unix target with insecure credentials", func(t *testing.T) {
+ cfg := &Config{
+ Target: "dns:///localhost:8080",
+ UserAgent: "test/1.0",
+ }
+ _, err := ClientConnFor(cfg)
+ if err == nil {
+ t.Fatal("expected error for non-unix target with insecure credentials")
+ }
+ if !strings.Contains(err.Error(), "insecure credentials require unix://") {
+ t.Errorf("unexpected error message: %v", err)
+ }
+ })
}
diff --git a/pkg/grpc/client/config.go b/pkg/grpc/client/config.go
index 308e72bef..1697845df 100644
--- a/pkg/grpc/client/config.go
+++ b/pkg/grpc/client/config.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -17,10 +17,11 @@ package client
import (
"fmt"
"os"
+ "strings"
"time"
"github.com/go-logr/logr"
- "github.com/nvidia/nvsentinel/pkg/util/version"
+ "github.com/nvidia/nvsentinel/pkg/version"
)
const (
@@ -73,6 +74,12 @@ func (c *Config) Validate() error {
return fmt.Errorf("gRPC target address is required; verify %s is not empty", NvidiaDeviceAPITargetEnvVar)
}
+ // Validate target scheme
+ if !strings.HasPrefix(c.Target, "unix://") && !strings.HasPrefix(c.Target, "unix:") &&
+ !strings.HasPrefix(c.Target, "dns:") && !strings.HasPrefix(c.Target, "passthrough:") {
+ return fmt.Errorf("gRPC target %q must use unix://, dns:, or passthrough: scheme", c.Target)
+ }
+
if c.UserAgent == "" {
return fmt.Errorf("user-agent cannot be empty")
}
diff --git a/pkg/grpc/client/config_test.go b/pkg/grpc/client/config_test.go
index 8cb550ed3..048b54e13 100644
--- a/pkg/grpc/client/config_test.go
+++ b/pkg/grpc/client/config_test.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -86,13 +86,53 @@ func TestConfig_Validate(t *testing.T) {
wantErr bool
}{
{
- name: "Valid config",
+ name: "Valid unix:/// config",
cfg: Config{
Target: "unix:///var/run/test.sock",
UserAgent: "test/1.0",
},
wantErr: false,
},
+ {
+ name: "Valid unix: config",
+ cfg: Config{
+ Target: "unix:/var/run/test.sock",
+ UserAgent: "test/1.0",
+ },
+ wantErr: false,
+ },
+ {
+ name: "Valid dns: config",
+ cfg: Config{
+ Target: "dns:///localhost:8080",
+ UserAgent: "test/1.0",
+ },
+ wantErr: false,
+ },
+ {
+ name: "Valid passthrough: config",
+ cfg: Config{
+ Target: "passthrough:///localhost:8080",
+ UserAgent: "test/1.0",
+ },
+ wantErr: false,
+ },
+ {
+ name: "Rejects http scheme",
+ cfg: Config{
+ Target: "http://evil.com",
+ UserAgent: "test/1.0",
+ },
+ wantErr: true,
+ },
+ {
+ name: "Rejects bare hostname",
+ cfg: Config{
+ Target: "somehost:1234",
+ UserAgent: "test/1.0",
+ },
+ wantErr: true,
+ },
{
name: "Missing target",
cfg: Config{
diff --git a/pkg/grpc/client/interceptors.go b/pkg/grpc/client/interceptors.go
index 796a34e50..c8e9e391c 100644
--- a/pkg/grpc/client/interceptors.go
+++ b/pkg/grpc/client/interceptors.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -46,7 +46,8 @@ func NewLatencyUnaryInterceptor(logger logr.Logger) grpc.UnaryClientInterceptor
return err
}
- logger.Error(err, "RPC failed", kv...)
+ logger.V(4).Info("RPC error details", "error", err)
+ logger.Error(nil, "RPC failed", kv...)
return err
}
@@ -81,7 +82,8 @@ func NewLatencyStreamInterceptor(logger logr.Logger) grpc.StreamClientIntercepto
return stream, err
}
- logger.Error(err, "Stream establishment failed", kv...)
+ logger.V(4).Info("Stream error details", "error", err)
+ logger.Error(nil, "Stream establishment failed", kv...)
return stream, err
}
diff --git a/pkg/grpc/client/watcher.go b/pkg/grpc/client/watcher.go
index 5972ef536..f688d550e 100644
--- a/pkg/grpc/client/watcher.go
+++ b/pkg/grpc/client/watcher.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -17,8 +17,10 @@ package client
import (
"context"
"errors"
+ "fmt"
"io"
"sync"
+ "time"
"github.com/go-logr/logr"
"google.golang.org/grpc/codes"
@@ -64,17 +66,16 @@ func NewWatcher(
return w
}
-// Stop cancels the context and closes the event source.
+// Stop signals the receive loop to exit, cancels the context, and closes the event source.
func (w *Watcher) Stop() {
w.stopOnce.Do(func() {
w.logger.V(4).Info("Stopping watcher")
- w.cancel()
+ close(w.done) // Signal receive loop to exit first
+ w.cancel() // Cancel the context
if err := w.source.Close(); err != nil {
w.logger.V(4).Info("Error closing source during stop", "err", err)
}
-
- close(w.done)
})
}
@@ -125,7 +126,7 @@ func (w *Watcher) receive() {
return
default:
- w.logger.V(2).Info("Skipping unknown event type from server", "rawType", typeStr)
+ w.logger.V(1).Info("Skipping unknown event type from server", "rawType", typeStr)
continue
}
@@ -141,17 +142,26 @@ func (w *Watcher) receive() {
"resourceVersion", meta.GetResourceVersion(),
)
}
+ case <-time.After(30 * time.Second):
+ w.logger.Error(nil, "Event send timed out; consumer not reading, stopping watcher")
+ return
}
}
}
func (w *Watcher) sendError(err error) {
st := status.Convert(err)
-
code := st.Code()
+
+ // Log full error details at debug level only
+ w.logger.V(4).Info("Watch stream error",
+ "code", code,
+ "serverMessage", st.Message(),
+ )
+
statusErr := &metav1.Status{
Status: metav1.StatusFailure,
- Message: st.Message(),
+ Message: fmt.Sprintf("watch stream error: %s", code.String()),
Code: int32(code), // #nosec G115
}
@@ -181,5 +191,7 @@ func (w *Watcher) sendError(err error) {
case <-w.done:
w.logger.V(4).Info("Watcher already done, dropping error event")
case w.result <- watch.Event{Type: watch.Error, Object: statusErr}:
+ case <-time.After(5 * time.Second):
+ w.logger.V(2).Info("Error event send timed out, dropping")
}
}
diff --git a/pkg/providers/nvml/enumerator.go b/pkg/providers/nvml/enumerator.go
new file mode 100644
index 000000000..f1ac61b38
--- /dev/null
+++ b/pkg/providers/nvml/enumerator.go
@@ -0,0 +1,199 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build nvml
+
+package nvml
+
+import (
+ "fmt"
+ "time"
+
+ "github.com/NVIDIA/go-nvml/pkg/nvml"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+ devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1"
+)
+
+// enumerateDevices discovers all GPUs via NVML and registers them via gRPC.
+//
+// For each GPU found, it extracts device information and creates a GPU entry
+// via the GpuService API with an initial "NVMLReady" condition set to True.
+//
+// Returns the number of GPUs discovered.
+func (p *Provider) enumerateDevices() (int, error) {
+ count, ret := p.nvmllib.DeviceGetCount()
+ if ret != nvml.SUCCESS {
+ return 0, fmt.Errorf("failed to get device count: %v", nvml.ErrorString(ret))
+ }
+
+ if count == 0 {
+ p.logger.Info("No GPUs found on this node")
+ return 0, nil
+ }
+
+ p.logger.V(1).Info("Enumerating GPUs", "count", count)
+
+ successCount := 0
+ uuids := make([]string, 0, count)
+
+ for i := 0; i < count; i++ {
+ device, ret := p.nvmllib.DeviceGetHandleByIndex(i)
+ if ret != nvml.SUCCESS {
+ p.logger.Error(nil, "Failed to get device handle", "index", i, "error", nvml.ErrorString(ret))
+
+ continue
+ }
+
+ gpu, productName, memoryBytes, err := p.deviceToGpu(i, device)
+ if err != nil {
+ p.logger.Error(err, "Failed to get GPU info", "index", i)
+
+ continue
+ }
+
+ // Register GPU via typed client (Create is idempotent -- returns existing GPU if already registered)
+ _, err = p.client.Create(p.ctx, gpu, metav1.CreateOptions{})
+ if err != nil {
+ p.logger.Error(err, "Failed to create GPU via gRPC", "uuid", gpu.Name)
+
+ continue
+ }
+
+ // Track UUID for health monitoring
+ uuids = append(uuids, gpu.Name)
+
+ p.logger.Info("GPU registered",
+ "uuid", gpu.Name,
+ "productName", productName,
+ "memory", FormatBytes(memoryBytes),
+ )
+
+ successCount++
+ }
+
+ // Assign tracked UUIDs atomically (caller holds p.mu)
+ p.gpuUUIDs = uuids
+
+ return successCount, nil
+}
+
+// deviceToGpu extracts GPU information from an NVML device handle.
+// Returns the GPU object, product name, and memory bytes (for logging).
+func (p *Provider) deviceToGpu(index int, device Device) (*devicev1alpha1.GPU, string, uint64, error) {
+ // Get UUID (required)
+ uuid, ret := device.GetUUID()
+ if ret != nvml.SUCCESS {
+ return nil, "", 0, fmt.Errorf("failed to get UUID: %v", nvml.ErrorString(ret))
+ }
+
+ // Get memory info (for logging)
+ var memoryBytes uint64
+
+ memInfo, ret := device.GetMemoryInfo()
+ if ret == nvml.SUCCESS {
+ memoryBytes = memInfo.Total
+ }
+
+ // Get product name (for logging)
+ productName, ret := device.GetName()
+ if ret != nvml.SUCCESS {
+ productName = "Unknown"
+ }
+
+ // Build GPU object using K8s-native types
+ now := metav1.Now()
+ gpu := &devicev1alpha1.GPU{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: uuid,
+ },
+ Spec: devicev1alpha1.GPUSpec{
+ UUID: uuid,
+ },
+ Status: devicev1alpha1.GPUStatus{
+ Conditions: []metav1.Condition{
+ {
+ Type: ConditionTypeNVMLReady,
+ Status: metav1.ConditionStatus(ConditionStatusTrue),
+ Reason: "Initialized",
+ Message: fmt.Sprintf("GPU enumerated via NVML: %s (%s)", productName, FormatBytes(memoryBytes)),
+ LastTransitionTime: now,
+ },
+ },
+ },
+ }
+
+ return gpu, productName, memoryBytes, nil
+}
+
+// UpdateCondition updates a single condition on a GPU via the typed client.
+//
+// This method:
+// 1. Gets the current GPU state
+// 2. Updates/adds the condition in the status
+// 3. Sends the updated status via UpdateStatus (status subresource)
+//
+// The condition's LastTransitionTime is set to the current time.
+func (p *Provider) UpdateCondition(
+ uuid string,
+ conditionType string,
+ conditionStatus string,
+ reason, message string,
+) error {
+ // Get current GPU state
+ gpu, err := p.client.Get(p.ctx, uuid, metav1.GetOptions{})
+ if err != nil {
+ return fmt.Errorf("failed to get GPU %s: %w", uuid, err)
+ }
+
+ if gpu == nil {
+ return fmt.Errorf("Get returned nil for %s", uuid)
+ }
+
+ // Build the new condition
+ condition := metav1.Condition{
+ Type: conditionType,
+ Status: metav1.ConditionStatus(conditionStatus),
+ Reason: reason,
+ Message: message,
+ LastTransitionTime: metav1.NewTime(time.Now()),
+ }
+
+ // Find and replace existing condition, or append
+ found := false
+ for i, existing := range gpu.Status.Conditions {
+ if existing.Type == conditionType {
+ gpu.Status.Conditions[i] = condition
+ found = true
+ break
+ }
+ }
+ if !found {
+ gpu.Status.Conditions = append(gpu.Status.Conditions, condition)
+ }
+
+ // Cap conditions to prevent unbounded growth
+ const maxConditions = 100
+ if len(gpu.Status.Conditions) > maxConditions {
+ gpu.Status.Conditions = gpu.Status.Conditions[len(gpu.Status.Conditions)-maxConditions:]
+ }
+
+ // Update the GPU status via the status subresource
+ _, err = p.client.UpdateStatus(p.ctx, gpu, metav1.UpdateOptions{})
+ if err != nil {
+ return fmt.Errorf("failed to update GPU status %s: %w", uuid, err)
+ }
+
+ return nil
+}
diff --git a/pkg/providers/nvml/health_monitor.go b/pkg/providers/nvml/health_monitor.go
new file mode 100644
index 000000000..5169b3d79
--- /dev/null
+++ b/pkg/providers/nvml/health_monitor.go
@@ -0,0 +1,282 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build nvml
+
+package nvml
+
+import (
+ "fmt"
+ "time"
+
+ "github.com/NVIDIA/go-nvml/pkg/nvml"
+)
+
+// HealthMonitor monitors GPU health via NVML events.
+//
+// NOTE(review): currently just a handle back to the owning Provider; the
+// monitoring loop itself runs as Provider methods (see runHealthMonitor).
+type HealthMonitor struct {
+	provider *Provider
+}
+
+// EventTimeout is the timeout for NVML event wait (in milliseconds).
+// It bounds how long processEvents blocks, so it also caps how long the
+// monitor goroutine takes to notice a shutdown request.
+const EventTimeout = 5000
+
+// unknownUUID is the placeholder used in log fields when a device UUID
+// cannot be retrieved from NVML.
+const unknownUUID = "unknown"
+
+// startHealthMonitoring initializes and starts XID event monitoring.
+//
+// It creates an NVML event set, registers XID/ECC events on every device
+// that supports them, and spawns the monitor goroutine. Devices that fail
+// UUID lookup, do not support event queries, or are too old are skipped;
+// the call only fails when no device at all could be registered.
+//
+// Called from Start with p.mu held, so it may write p.eventSet,
+// p.healthMonitor and p.monitorRunning without additional locking.
+func (p *Provider) startHealthMonitoring() error {
+	// Create event set
+	eventSet, ret := p.nvmllib.EventSetCreate()
+	if ret != nvml.SUCCESS {
+		return fmt.Errorf("failed to create event set: %v", nvml.ErrorString(ret))
+	}
+
+	p.eventSet = eventSet
+
+	// Register for health events on all GPUs
+	eventMask := uint64(
+		nvml.EventTypeXidCriticalError |
+			nvml.EventTypeDoubleBitEccError |
+			nvml.EventTypeSingleBitEccError,
+	)
+
+	count, ret := p.nvmllib.DeviceGetCount()
+	if ret != nvml.SUCCESS {
+		// Free the event set we just created so it does not leak.
+		_ = p.eventSet.Free()
+		p.eventSet = nil
+		return fmt.Errorf("failed to get device count for health monitoring: %v", nvml.ErrorString(ret))
+	}
+
+	registeredCount := 0
+
+	for i := 0; i < count; i++ {
+		device, ret := p.nvmllib.DeviceGetHandleByIndex(i)
+		if ret != nvml.SUCCESS {
+			continue
+		}
+
+		uuid, ret := device.GetUUID()
+		if ret != nvml.SUCCESS {
+			p.logger.V(1).Info("Failed to get device UUID for health monitoring, skipping",
+				"index", i,
+				"error", nvml.ErrorString(ret),
+			)
+			continue
+		}
+
+		// Get supported events for this device
+		supportedEvents, ret := device.GetSupportedEventTypes()
+		if ret != nvml.SUCCESS {
+			p.logger.V(1).Info("Device does not support event queries",
+				"index", i,
+				"uuid", uuid,
+				"error", nvml.ErrorString(ret),
+			)
+
+			continue
+		}
+
+		// Register only supported events
+		eventsToRegister := eventMask & supportedEvents
+		if eventsToRegister == 0 {
+			p.logger.V(1).Info("Device does not support any health events",
+				"index", i,
+				"uuid", uuid,
+			)
+
+			continue
+		}
+
+		ret = device.RegisterEvents(eventsToRegister, p.eventSet.Raw())
+		if ret == nvml.ERROR_NOT_SUPPORTED {
+			p.logger.V(1).Info("Device too old for health monitoring",
+				"index", i,
+				"uuid", uuid,
+			)
+
+			continue
+		}
+
+		if ret != nvml.SUCCESS {
+			p.logger.Error(nil, "Failed to register events",
+				"index", i,
+				"uuid", uuid,
+				"error", nvml.ErrorString(ret),
+			)
+
+			continue
+		}
+
+		registeredCount++
+
+		p.logger.V(2).Info("Registered health events",
+			"index", i,
+			"uuid", uuid,
+			"events", eventsToRegister,
+		)
+	}
+
+	if registeredCount == 0 {
+		// Nothing to monitor; release the event set rather than leaking it.
+		_ = p.eventSet.Free()
+		p.eventSet = nil
+
+		return fmt.Errorf("no devices support health event monitoring")
+	}
+
+	p.logger.Info("Starting health monitoring", "devices", registeredCount)
+
+	// Create health monitor
+	p.healthMonitor = &HealthMonitor{provider: p}
+
+	// Start monitoring goroutine (paired with wg.Done in runHealthMonitor;
+	// Stop waits on p.wg before freeing the event set)
+	p.wg.Add(1)
+
+	go p.runHealthMonitor()
+
+	p.monitorRunning = true
+
+	return nil
+}
+
+// runHealthMonitor is the main health monitoring loop.
+//
+// Before every iteration it checks whether the provider context has been
+// cancelled, so shutdown is noticed promptly; otherwise it delegates to
+// processEvents, which blocks for at most EventTimeout milliseconds
+// waiting on NVML.
+func (p *Provider) runHealthMonitor() {
+	defer p.wg.Done()
+
+	p.logger.V(1).Info("Health monitor started")
+
+	// ctx.Err() is non-nil exactly once the context is cancelled, which is
+	// equivalent to the select/default pattern on ctx.Done().
+	for p.ctx.Err() == nil {
+		p.processEvents()
+	}
+
+	p.logger.V(1).Info("Health monitor stopping")
+}
+
+// processEvents waits for and processes a single NVML event.
+//
+// ERROR_TIMEOUT is the normal idle case and returns immediately.
+// ERROR_GPU_IS_LOST is escalated by marking every tracked GPU unhealthy.
+// Any other Wait failure is logged and followed by a short sleep so a
+// persistently failing Wait cannot spin the monitor goroutine.
+func (p *Provider) processEvents() {
+	event, ret := p.eventSet.Wait(EventTimeout)
+
+	if ret == nvml.ERROR_TIMEOUT {
+		// Normal timeout, continue
+		return
+	}
+
+	if ret != nvml.SUCCESS {
+		if ret == nvml.ERROR_GPU_IS_LOST {
+			p.logger.Error(nil, "GPU lost detected, marking all GPUs unhealthy")
+			p.markAllUnhealthy("GPULost", "GPU is lost error detected")
+
+			return
+		}
+
+		p.logger.V(2).Info("Error waiting for event",
+			"error", nvml.ErrorString(ret),
+		)
+
+		// Brief sleep to avoid tight loop on persistent errors
+		time.Sleep(100 * time.Millisecond)
+
+		return
+	}
+
+	// Process the event
+	p.handleEvent(event)
+}
+
+// handleEvent processes a single NVML event.
+//
+// Only XID critical errors change health state: XIDs deemed ignorable by
+// isIgnoredXid (including the configured additional list) are logged and
+// dropped, and anything else flips the NVMLReady condition to False for
+// the affected GPU. All other event types are logged at V(2) only.
+func (p *Provider) handleEvent(event nvml.EventData) {
+	eventType := event.EventType
+	xid := event.EventData
+	gpuInstanceID := event.GpuInstanceId
+	computeInstanceID := event.ComputeInstanceId
+
+	// Get UUID for logging (and for the condition update below); falls
+	// back to unknownUUID if the device handle cannot report one.
+	uuid := unknownUUID
+
+	if event.Device != nil {
+		if u, ret := event.Device.GetUUID(); ret == nvml.SUCCESS {
+			uuid = u
+		}
+	}
+
+	// Only process XID critical errors for health changes
+	if eventType != nvml.EventTypeXidCriticalError {
+		p.logger.V(2).Info("Non-critical event received",
+			"uuid", uuid,
+			"eventType", eventType,
+			"xid", xid,
+		)
+
+		return
+	}
+
+	// Check if this XID should be ignored
+	if isIgnoredXid(xid, p.additionalIgnoredXids) {
+		p.logger.V(2).Info("Ignoring non-critical XID",
+			"uuid", uuid,
+			"xid", xid,
+			"gpuInstanceId", gpuInstanceID,
+			"computeInstanceId", computeInstanceID,
+		)
+
+		return
+	}
+
+	// Critical XID - mark GPU unhealthy
+	p.logger.Info("Critical XID error detected",
+		"uuid", uuid,
+		"xid", xid,
+		"xidName", xidToString(xid),
+		"gpuInstanceId", gpuInstanceID,
+		"computeInstanceId", computeInstanceID,
+	)
+
+	message := fmt.Sprintf("Critical XID error %d (%s) detected", xid, xidToString(xid))
+	if err := p.UpdateCondition(uuid, ConditionTypeNVMLReady, ConditionStatusFalse, "XidError", message); err != nil {
+		p.logger.Error(err, "Failed to update GPU condition", "uuid", uuid)
+	}
+}
+
+// markAllUnhealthy marks all tracked GPUs as unhealthy.
+//
+// A snapshot of the UUID list is taken under the read lock so the
+// (potentially slow) per-GPU status updates happen without holding it.
+func (p *Provider) markAllUnhealthy(reason, message string) {
+	p.mu.RLock()
+	snapshot := append([]string(nil), p.gpuUUIDs...)
+	p.mu.RUnlock()
+
+	for _, id := range snapshot {
+		if err := p.UpdateCondition(id, ConditionTypeNVMLReady, ConditionStatusFalse, reason, message); err != nil {
+			p.logger.Error(err, "Failed to mark GPU unhealthy", "uuid", id)
+		}
+	}
+}
+
+// MarkHealthy marks a specific GPU as healthy.
+//
+// This can be called to restore a GPU's health status after recovery.
+func (p *Provider) MarkHealthy(uuid string) error {
+ return p.UpdateCondition(uuid, ConditionTypeNVMLReady, ConditionStatusTrue, "Healthy", "GPU is healthy")
+}
diff --git a/pkg/providers/nvml/interface.go b/pkg/providers/nvml/interface.go
new file mode 100644
index 000000000..5b534b154
--- /dev/null
+++ b/pkg/providers/nvml/interface.go
@@ -0,0 +1,143 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build nvml
+
+package nvml
+
+import (
+ "github.com/NVIDIA/go-nvml/pkg/nvml"
+)
+
+// Library is the interface for NVML library operations.
+// This interface contains only the methods used by the Provider,
+// making it easier to mock for testing.
+//
+// Implementations: nvmlLibraryWrapper (production, wrapping go-nvml)
+// and MockLibrary (tests).
+type Library interface {
+	Init() nvml.Return
+	Shutdown() nvml.Return
+	SystemGetDriverVersion() (string, nvml.Return)
+	DeviceGetCount() (int, nvml.Return)
+	DeviceGetHandleByIndex(index int) (Device, nvml.Return)
+	EventSetCreate() (EventSet, nvml.Return)
+}
+
+// Device is the interface for NVML device operations.
+//
+// It mirrors the subset of nvml.Device the provider needs for device
+// enumeration and health-event registration.
+type Device interface {
+	GetUUID() (string, nvml.Return)
+	GetName() (string, nvml.Return)
+	GetMemoryInfo() (nvml.Memory, nvml.Return)
+	GetRetiredPagesPendingStatus() (nvml.EnableState, nvml.Return)
+	GetSupportedEventTypes() (uint64, nvml.Return)
+	RegisterEvents(eventTypes uint64, set nvml.EventSet) nvml.Return
+}
+
+// EventSet is the interface for NVML event set operations.
+//
+// Wait's timeout is in milliseconds (see EventTimeout).
+type EventSet interface {
+	Wait(timeout uint32) (nvml.EventData, nvml.Return)
+	Free() nvml.Return
+	// Raw returns the underlying nvml.EventSet for use with RegisterEvents.
+	Raw() nvml.EventSet
+}
+
+// nvmlLibraryWrapper wraps the real nvml.Interface to implement Library.
+type nvmlLibraryWrapper struct {
+	lib nvml.Interface
+}
+
+// NewLibraryWrapper creates a Library wrapper around an nvml.Interface.
+func NewLibraryWrapper(lib nvml.Interface) Library {
+	return &nvmlLibraryWrapper{lib: lib}
+}
+
+// Init delegates to the underlying library.
+func (w *nvmlLibraryWrapper) Init() nvml.Return {
+	return w.lib.Init()
+}
+
+// Shutdown delegates to the underlying library.
+func (w *nvmlLibraryWrapper) Shutdown() nvml.Return {
+	return w.lib.Shutdown()
+}
+
+// SystemGetDriverVersion delegates to the underlying library.
+func (w *nvmlLibraryWrapper) SystemGetDriverVersion() (string, nvml.Return) {
+	return w.lib.SystemGetDriverVersion()
+}
+
+// DeviceGetCount delegates to the underlying library.
+func (w *nvmlLibraryWrapper) DeviceGetCount() (int, nvml.Return) {
+	return w.lib.DeviceGetCount()
+}
+
+// DeviceGetHandleByIndex wraps the returned device so it satisfies the
+// local Device interface. Returns a nil Device on failure.
+func (w *nvmlLibraryWrapper) DeviceGetHandleByIndex(index int) (Device, nvml.Return) {
+	device, ret := w.lib.DeviceGetHandleByIndex(index)
+	if ret != nvml.SUCCESS {
+		return nil, ret
+	}
+
+	return &nvmlDeviceWrapper{device: device}, ret
+}
+
+// EventSetCreate wraps the created event set so it satisfies the local
+// EventSet interface. Returns a nil EventSet on failure.
+func (w *nvmlLibraryWrapper) EventSetCreate() (EventSet, nvml.Return) {
+	es, ret := w.lib.EventSetCreate()
+	if ret != nvml.SUCCESS {
+		return nil, ret
+	}
+
+	return &nvmlEventSetWrapper{es: es}, ret
+}
+}
+
+// nvmlDeviceWrapper wraps nvml.Device to implement Device.
+// All methods are straight delegations with no added behavior.
+type nvmlDeviceWrapper struct {
+	device nvml.Device
+}
+
+// GetUUID delegates to the wrapped device.
+func (w *nvmlDeviceWrapper) GetUUID() (string, nvml.Return) {
+	return w.device.GetUUID()
+}
+
+// GetName delegates to the wrapped device.
+func (w *nvmlDeviceWrapper) GetName() (string, nvml.Return) {
+	return w.device.GetName()
+}
+
+// GetMemoryInfo delegates to the wrapped device.
+func (w *nvmlDeviceWrapper) GetMemoryInfo() (nvml.Memory, nvml.Return) {
+	return w.device.GetMemoryInfo()
+}
+
+// GetRetiredPagesPendingStatus delegates to the wrapped device.
+func (w *nvmlDeviceWrapper) GetRetiredPagesPendingStatus() (nvml.EnableState, nvml.Return) {
+	return w.device.GetRetiredPagesPendingStatus()
+}
+
+// GetSupportedEventTypes delegates to the wrapped device.
+func (w *nvmlDeviceWrapper) GetSupportedEventTypes() (uint64, nvml.Return) {
+	return w.device.GetSupportedEventTypes()
+}
+
+// RegisterEvents delegates to the wrapped device.
+func (w *nvmlDeviceWrapper) RegisterEvents(eventTypes uint64, set nvml.EventSet) nvml.Return {
+	return w.device.RegisterEvents(eventTypes, set)
+}
+
+// nvmlEventSetWrapper wraps nvml.EventSet to implement EventSet.
+type nvmlEventSetWrapper struct {
+	es nvml.EventSet
+}
+
+// Wait delegates to the wrapped event set; timeout is in milliseconds.
+func (w *nvmlEventSetWrapper) Wait(timeout uint32) (nvml.EventData, nvml.Return) {
+	return w.es.Wait(timeout)
+}
+
+// Free delegates to the wrapped event set.
+func (w *nvmlEventSetWrapper) Free() nvml.Return {
+	return w.es.Free()
+}
+
+// Raw returns the underlying nvml.EventSet for use with device.RegisterEvents.
+// This is needed because RegisterEvents expects the concrete nvml.EventSet type.
+func (w *nvmlEventSetWrapper) Raw() nvml.EventSet {
+	return w.es
+}
diff --git a/pkg/providers/nvml/mock_test.go b/pkg/providers/nvml/mock_test.go
new file mode 100644
index 000000000..05785ae64
--- /dev/null
+++ b/pkg/providers/nvml/mock_test.go
@@ -0,0 +1,245 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build nvml
+
+package nvml
+
+import (
+ "sync"
+
+ "github.com/NVIDIA/go-nvml/pkg/nvml"
+)
+
+// MockLibrary is a mock implementation of Library for testing.
+//
+// Behavior is driven by the exported fields. All methods (including
+// AddDevice) take the internal mutex so the mock is safe to use
+// concurrently with the provider's goroutines — previously only
+// Init/Shutdown were synchronized while the Devices map and EventSet
+// field were accessed without the lock.
+type MockLibrary struct {
+	// Init behavior
+	InitReturn nvml.Return
+
+	// Shutdown behavior
+	ShutdownReturn nvml.Return
+
+	// SystemGetDriverVersion behavior
+	DriverVersion       string
+	DriverVersionReturn nvml.Return
+
+	// DeviceGetCount behavior
+	DeviceCount       int
+	DeviceCountReturn nvml.Return
+
+	// Devices returns mock devices by index
+	Devices map[int]*MockDevice
+
+	// EventSetCreate behavior
+	EventSet             *MockEventSet
+	EventSetCreateReturn nvml.Return
+
+	// Track calls for verification
+	mu             sync.Mutex
+	InitCalled     bool
+	ShutdownCalled bool
+}
+
+// NewMockLibrary creates a new mock Library with defaults.
+func NewMockLibrary() *MockLibrary {
+	return &MockLibrary{
+		InitReturn:           nvml.SUCCESS,
+		ShutdownReturn:       nvml.SUCCESS,
+		DriverVersion:        "535.104.05",
+		DriverVersionReturn:  nvml.SUCCESS,
+		DeviceCount:          0,
+		DeviceCountReturn:    nvml.SUCCESS,
+		Devices:              make(map[int]*MockDevice),
+		EventSetCreateReturn: nvml.SUCCESS,
+	}
+}
+
+// AddDevice adds a mock device at the specified index and keeps
+// DeviceCount in sync with the number of registered devices.
+func (m *MockLibrary) AddDevice(index int, device *MockDevice) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.Devices[index] = device
+	m.DeviceCount = len(m.Devices)
+}
+
+// Init implements Library.
+func (m *MockLibrary) Init() nvml.Return {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.InitCalled = true
+
+	return m.InitReturn
+}
+
+// Shutdown implements Library.
+func (m *MockLibrary) Shutdown() nvml.Return {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.ShutdownCalled = true
+
+	return m.ShutdownReturn
+}
+
+// SystemGetDriverVersion implements Library.
+func (m *MockLibrary) SystemGetDriverVersion() (string, nvml.Return) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	return m.DriverVersion, m.DriverVersionReturn
+}
+
+// DeviceGetCount implements Library.
+func (m *MockLibrary) DeviceGetCount() (int, nvml.Return) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	return m.DeviceCount, m.DeviceCountReturn
+}
+
+// DeviceGetHandleByIndex implements Library.
+func (m *MockLibrary) DeviceGetHandleByIndex(index int) (Device, nvml.Return) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	if device, ok := m.Devices[index]; ok {
+		return device, nvml.SUCCESS
+	}
+
+	return nil, nvml.ERROR_NOT_FOUND
+}
+
+// EventSetCreate implements Library. It lazily creates a MockEventSet on
+// first use so tests that never configure one still get a working set.
+func (m *MockLibrary) EventSetCreate() (EventSet, nvml.Return) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	if m.EventSet == nil {
+		m.EventSet = NewMockEventSet()
+	}
+
+	return m.EventSet, m.EventSetCreateReturn
+}
+
+// MockDevice is a mock implementation of Device.
+//
+// The fields are plain knobs read without locking; configure them before
+// handing the device to code under test.
+type MockDevice struct {
+	UUID                      string
+	UUIDReturn                nvml.Return
+	Name                      string
+	NameReturn                nvml.Return
+	MemoryInfo                nvml.Memory
+	MemoryInfoReturn          nvml.Return
+	RetiredPagesPending       nvml.EnableState
+	RetiredPagesPendingReturn nvml.Return
+	SupportedEvents           uint64
+	SupportedEventsReturn     nvml.Return
+	RegisterEventsReturn      nvml.Return
+}
+
+// NewMockDevice creates a new mock device with sensible defaults:
+// 16 GB of memory, XID + double-bit ECC events supported, and SUCCESS
+// return codes everywhere.
+func NewMockDevice(uuid, name string) *MockDevice {
+	return &MockDevice{
+		UUID:       uuid,
+		UUIDReturn: nvml.SUCCESS,
+		Name:       name,
+		NameReturn: nvml.SUCCESS,
+		MemoryInfo: nvml.Memory{
+			Total: 16 * 1024 * 1024 * 1024, // 16 GB
+			Free:  15 * 1024 * 1024 * 1024,
+			Used:  1 * 1024 * 1024 * 1024,
+		},
+		MemoryInfoReturn:      nvml.SUCCESS,
+		SupportedEvents:       uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError),
+		SupportedEventsReturn: nvml.SUCCESS,
+		RegisterEventsReturn:  nvml.SUCCESS,
+	}
+}
+
+// GetUUID implements Device.
+func (d *MockDevice) GetUUID() (string, nvml.Return) {
+	return d.UUID, d.UUIDReturn
+}
+
+// GetName implements Device.
+func (d *MockDevice) GetName() (string, nvml.Return) {
+	return d.Name, d.NameReturn
+}
+
+// GetMemoryInfo implements Device.
+func (d *MockDevice) GetMemoryInfo() (nvml.Memory, nvml.Return) {
+	return d.MemoryInfo, d.MemoryInfoReturn
+}
+
+// GetRetiredPagesPendingStatus implements Device.
+func (d *MockDevice) GetRetiredPagesPendingStatus() (nvml.EnableState, nvml.Return) {
+	return d.RetiredPagesPending, d.RetiredPagesPendingReturn
+}
+
+// GetSupportedEventTypes implements Device.
+func (d *MockDevice) GetSupportedEventTypes() (uint64, nvml.Return) {
+	return d.SupportedEvents, d.SupportedEventsReturn
+}
+
+// RegisterEvents implements Device. The arguments are ignored; only the
+// configured return code matters.
+func (d *MockDevice) RegisterEvents(_ uint64, _ nvml.EventSet) nvml.Return {
+	return d.RegisterEventsReturn
+}
+
+// MockEventSet is a mock implementation of EventSet.
+//
+// Queued events (via AddEvent) are returned by Wait in FIFO order, once
+// each; after the queue is drained Wait returns WaitReturn (a timeout by
+// default) with a zero EventData.
+type MockEventSet struct {
+	mu         sync.Mutex
+	events     []nvml.EventData
+	eventIdx   int
+	WaitReturn nvml.Return
+	FreeReturn nvml.Return
+	Freed      bool
+}
+
+// NewMockEventSet creates a new mock event set.
+func NewMockEventSet() *MockEventSet {
+	return &MockEventSet{
+		events:     make([]nvml.EventData, 0),
+		WaitReturn: nvml.ERROR_TIMEOUT,
+		FreeReturn: nvml.SUCCESS,
+	}
+}
+
+// AddEvent adds an event to be returned by Wait.
+func (e *MockEventSet) AddEvent(event nvml.EventData) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	e.events = append(e.events, event)
+}
+
+// Wait implements EventSet. The timeout argument is ignored; queued
+// events are delivered immediately.
+func (e *MockEventSet) Wait(_ uint32) (nvml.EventData, nvml.Return) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	if e.eventIdx < len(e.events) {
+		event := e.events[e.eventIdx]
+		e.eventIdx++
+
+		return event, nvml.SUCCESS
+	}
+
+	return nvml.EventData{}, e.WaitReturn
+}
+
+// Free implements EventSet and records that it was called.
+func (e *MockEventSet) Free() nvml.Return {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	e.Freed = true
+
+	return e.FreeReturn
+}
+
+// Raw implements EventSet - returns nil for mocks since we don't need real event set.
+func (e *MockEventSet) Raw() nvml.EventSet {
+	return nil
+}
+
+// Compile-time interface checks.
+//
+// These fail the build if the mocks drift from the Library/Device/EventSet
+// interfaces they stand in for.
+var (
+	_ Library  = (*MockLibrary)(nil)
+	_ Device   = (*MockDevice)(nil)
+	_ EventSet = (*MockEventSet)(nil)
+)
+
diff --git a/pkg/providers/nvml/provider.go b/pkg/providers/nvml/provider.go
new file mode 100644
index 000000000..77c26bcd7
--- /dev/null
+++ b/pkg/providers/nvml/provider.go
@@ -0,0 +1,275 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build nvml
+
+// Package nvml provides a built-in NVML-based health provider for the Device API Server.
+//
+// This provider uses NVML (NVIDIA Management Library) to:
+// - Enumerate GPUs on the node at startup
+// - Monitor GPU health via XID error events
+// - Provide baseline device information when no external providers are connected
+//
+// The provider requires the NVIDIA driver to be installed and NVML libraries to be
+// accessible. When running in Kubernetes, this is typically achieved by using the
+// "nvidia" RuntimeClass which injects the driver libraries via the NVIDIA Container
+// Toolkit, without consuming GPU resources.
+package nvml
+
+import (
+ "context"
+ "fmt"
+ "sync"
+
+ "github.com/NVIDIA/go-nvml/pkg/nvml"
+ "k8s.io/klog/v2"
+
+ gpuclient "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned/typed/device/v1alpha1"
+)
+
+// Provider is the built-in NVML-based health provider.
+//
+// It uses NVML to enumerate GPUs and monitor their health status.
+// The provider is optional and gracefully degrades if NVML is unavailable.
+//
+// The provider communicates with the Device API Server via the gRPC client
+// interface, making it a "dogfooding" client of its own API. This design:
+// - Decouples the provider from server internals
+// - Enables running the provider as a separate sidecar process
+// - Validates the API from a provider's perspective
+type Provider struct {
+	// Configuration
+	config Config
+
+	// NVML library interface (uses our wrapper for testability)
+	nvmllib Library
+
+	// Typed client to communicate with Device API Server
+	client gpuclient.GPUInterface
+
+	// Health monitoring
+	eventSet       EventSet
+	healthMonitor  *HealthMonitor
+	monitorRunning bool
+
+	// Logger
+	logger klog.Logger
+
+	// Lifecycle management.
+	// mu guards initialized, gpuCount, monitorRunning and gpuUUIDs
+	// (see the accessors and markAllUnhealthy); eventSet is written under
+	// mu but read by the monitor goroutine, which Stop joins via wg before
+	// freeing it.
+	mu     sync.RWMutex
+	ctx    context.Context
+	cancel context.CancelFunc
+	wg     sync.WaitGroup
+
+	// State
+	initialized bool
+	gpuCount    int
+
+	// Tracked GPU UUIDs for health monitoring
+	gpuUUIDs []string
+
+	// Pre-computed map of additional ignored XIDs for O(1) lookup.
+	// Built once in Start; nil when no extra XIDs are configured.
+	additionalIgnoredXids map[uint64]bool
+}
+
+// Config holds configuration for the NVML provider.
+// See DefaultConfig for the recommended defaults.
+type Config struct {
+	// DriverRoot is the root path where NVIDIA driver libraries are located.
+	// Common values:
+	//   - "/run/nvidia/driver" (container with CDI/RuntimeClass)
+	//   - "/" (bare metal or host path mount)
+	DriverRoot string
+
+	// AdditionalIgnoredXids is a list of additional XID error codes to ignore.
+	// These are added to the default list of ignored XIDs (application errors).
+	AdditionalIgnoredXids []uint64
+
+	// HealthCheckEnabled enables XID event monitoring for health checks.
+	// When disabled, only device enumeration is performed.
+	HealthCheckEnabled bool
+}
+
+// DefaultConfig returns a Config with sensible defaults.
+func DefaultConfig() Config {
+ return Config{
+ DriverRoot: "/run/nvidia/driver",
+ AdditionalIgnoredXids: nil,
+ HealthCheckEnabled: true,
+ }
+}
+
+// New creates a new NVML provider.
+//
+// The provider is not started until Start() is called. If NVML cannot be
+// initialized (e.g., no driver installed), Start() will return an error
+// but the server can continue without NVML support.
+//
+// The client parameter is a GPUInterface used to communicate with the
+// Device API Server. This enables the provider to be either:
+//   - Co-located with the server (using a loopback connection)
+//   - Running as a separate sidecar process (using a network connection)
+func New(cfg Config, client gpuclient.GPUInterface, logger klog.Logger) *Provider {
+	logger = logger.WithName("nvml-provider")
+
+	// Find NVML library path.
+	// NOTE(review): FindDriverLibrary is defined elsewhere in this package;
+	// presumably it searches cfg.DriverRoot for libnvidia-ml — confirm there.
+	libraryPath := FindDriverLibrary(cfg.DriverRoot)
+	logger.V(2).Info("Using NVML library path", "path", libraryPath)
+
+	// Create NVML interface with explicit library path
+	var rawLib nvml.Interface
+	if libraryPath != "" {
+		rawLib = nvml.New(nvml.WithLibraryPath(libraryPath))
+	} else {
+		// Fall back to system default
+		rawLib = nvml.New()
+	}
+
+	return &Provider{
+		config:  cfg,
+		nvmllib: NewLibraryWrapper(rawLib),
+		client:  client,
+		logger:  logger,
+	}
+}
+
+// Start initializes NVML and enumerates GPUs.
+//
+// If health checking is enabled, it also starts the XID event monitoring
+// goroutine. Returns an error if NVML cannot be initialized or device
+// enumeration fails; a health-monitoring failure is logged but tolerated.
+// On the enumeration failure path the context and NVML library are torn
+// down again so a later Start can retry cleanly.
+func (p *Provider) Start(ctx context.Context) error {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	if p.initialized {
+		return fmt.Errorf("provider already started")
+	}
+
+	p.logger.Info("Starting NVML provider")
+
+	// Initialize NVML
+	ret := p.nvmllib.Init()
+	if ret != nvml.SUCCESS {
+		return fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret))
+	}
+
+	// Get driver version for logging (best effort; a failure here is not fatal)
+	driverVersion, ret := p.nvmllib.SystemGetDriverVersion()
+	if ret == nvml.SUCCESS {
+		p.logger.Info("NVML initialized", "driverVersion", driverVersion)
+	}
+
+	// Build map of additional ignored XIDs for O(1) lookup
+	if len(p.config.AdditionalIgnoredXids) > 0 {
+		p.additionalIgnoredXids = make(map[uint64]bool, len(p.config.AdditionalIgnoredXids))
+		for _, xid := range p.config.AdditionalIgnoredXids {
+			p.additionalIgnoredXids[xid] = true
+		}
+	}
+
+	// Set up context for lifecycle management (must be before enumerateDevices,
+	// which uses p.ctx for gRPC calls)
+	p.ctx, p.cancel = context.WithCancel(ctx)
+
+	// Enumerate devices
+	count, err := p.enumerateDevices()
+	if err != nil {
+		// Roll back: cancel the derived context and shut NVML down so the
+		// provider is back in its pre-Start state.
+		p.cancel()
+		p.ctx = nil
+		p.cancel = nil
+		_ = p.nvmllib.Shutdown()
+
+		return fmt.Errorf("failed to enumerate devices: %w", err)
+	}
+
+	p.gpuCount = count
+
+	p.logger.Info("Enumerated GPUs", "count", count)
+
+	p.initialized = true
+
+	// Start health monitoring if enabled and we have GPUs
+	if p.config.HealthCheckEnabled && count > 0 {
+		if err := p.startHealthMonitoring(); err != nil {
+			p.logger.Error(err, "Failed to start health monitoring, continuing without it")
+			// Don't fail - health monitoring is optional
+		}
+	}
+
+	return nil
+}
+
+// Stop shuts down the NVML provider.
+//
+// It stops health monitoring (if running) and shuts down NVML.
+// This method is safe to call multiple times.
+//
+// Shutdown order matters: the context is cancelled first, then the monitor
+// goroutine is joined via wg.Wait, and only then is the event set freed —
+// the monitor reads p.eventSet, so freeing earlier would race with it.
+func (p *Provider) Stop() {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	if !p.initialized {
+		return
+	}
+
+	p.logger.Info("Stopping NVML provider")
+
+	// Cancel context to stop health monitoring
+	if p.cancel != nil {
+		p.cancel()
+	}
+
+	// Wait for health monitor to stop
+	p.wg.Wait()
+
+	// Clean up event set
+	if p.eventSet != nil {
+		if ret := p.eventSet.Free(); ret != nvml.SUCCESS {
+			p.logger.V(1).Info("Failed to free event set", "error", nvml.ErrorString(ret))
+		}
+
+		p.eventSet = nil
+	}
+
+	// Shutdown NVML
+	if ret := p.nvmllib.Shutdown(); ret != nvml.SUCCESS {
+		p.logger.V(1).Info("Failed to shutdown NVML", "error", nvml.ErrorString(ret))
+	}
+
+	p.initialized = false
+	p.monitorRunning = false
+	p.logger.Info("NVML provider stopped")
+}
+
+// IsInitialized returns true if the provider has been successfully started.
+// Read-only, so it takes the read lock (consistent with markAllUnhealthy).
+func (p *Provider) IsInitialized() bool {
+	p.mu.RLock()
+	defer p.mu.RUnlock()
+
+	return p.initialized
+}
+
+// GPUCount returns the number of GPUs discovered.
+// Read-only, so it takes the read lock (consistent with markAllUnhealthy).
+func (p *Provider) GPUCount() int {
+	p.mu.RLock()
+	defer p.mu.RUnlock()
+
+	return p.gpuCount
+}
+
+// IsHealthMonitorRunning returns true if health monitoring is active.
+// Read-only, so it takes the read lock (consistent with markAllUnhealthy).
+func (p *Provider) IsHealthMonitorRunning() bool {
+	p.mu.RLock()
+	defer p.mu.RUnlock()
+
+	return p.monitorRunning
+}
diff --git a/pkg/providers/nvml/provider_test.go b/pkg/providers/nvml/provider_test.go
new file mode 100644
index 000000000..244fc106b
--- /dev/null
+++ b/pkg/providers/nvml/provider_test.go
@@ -0,0 +1,606 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build nvml
+
+package nvml
+
+import (
+ "context"
+ "testing"
+ "time"
+
+ "github.com/NVIDIA/go-nvml/pkg/nvml"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/watch"
+ "k8s.io/klog/v2"
+
+ devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1"
+ gpuclient "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned/typed/device/v1alpha1"
+ "github.com/nvidia/nvsentinel/pkg/testutil"
+)
+
+// testLogger returns a test logger (a named klogr instance; output goes
+// to klog's default destination, not to t.Log).
+func testLogger() klog.Logger {
+	return klog.NewKlogr().WithName("test")
+}
+
+// TestProvider_Start_Success tests successful provider initialization.
+//
+// It builds the Provider struct directly (bypassing New, which would load
+// the real NVML library) with a mock library holding two GPUs, then checks
+// that Init ran, both GPUs are visible via the typed client, and the
+// provider reports the expected state.
+func TestProvider_Start_Success(t *testing.T) {
+	mockLib := NewMockLibrary()
+	mockLib.AddDevice(0, NewMockDevice("GPU-uuid-0", "NVIDIA A100"))
+	mockLib.AddDevice(1, NewMockDevice("GPU-uuid-1", "NVIDIA A100"))
+
+	client := testutil.NewTestGPUTypedClient(t)
+
+	provider := &Provider{
+		config:  DefaultConfig(),
+		nvmllib: mockLib,
+		client:  client,
+		logger:  testLogger(),
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	err := provider.Start(ctx)
+	if err != nil {
+		t.Fatalf("Start() failed: %v", err)
+	}
+	defer provider.Stop()
+
+	// Verify NVML was initialized
+	if !mockLib.InitCalled {
+		t.Error("Init() was not called")
+	}
+
+	// Verify GPUs were registered
+	gpuList, err := client.List(context.Background(), metav1.ListOptions{})
+	if err != nil {
+		t.Fatalf("List failed: %v", err)
+	}
+	if len(gpuList.Items) != 2 {
+		t.Errorf("Expected 2 GPUs, got %d", len(gpuList.Items))
+	}
+
+	// Verify provider state
+	if !provider.IsInitialized() {
+		t.Error("Provider should be initialized")
+	}
+
+	if provider.GPUCount() != 2 {
+		t.Errorf("Expected GPUCount() = 2, got %d", provider.GPUCount())
+	}
+}
+
+// TestProvider_Start_NVMLInitFails tests graceful handling of NVML init failure.
+// Start must return an error and leave the provider uninitialized.
+func TestProvider_Start_NVMLInitFails(t *testing.T) {
+	mockLib := NewMockLibrary()
+	mockLib.InitReturn = nvml.ERROR_LIBRARY_NOT_FOUND
+
+	client := testutil.NewTestGPUTypedClient(t)
+
+	provider := &Provider{
+		config:  DefaultConfig(),
+		nvmllib: mockLib,
+		client:  client,
+		logger:  testLogger(),
+	}
+
+	ctx := context.Background()
+	err := provider.Start(ctx)
+
+	if err == nil {
+		t.Fatal("Expected Start() to fail when NVML init fails")
+	}
+
+	if provider.IsInitialized() {
+		t.Error("Provider should not be initialized after failure")
+	}
+}
+
+// TestProvider_Start_NoGPUs tests handling of nodes without GPUs.
+// Start must succeed with zero GPUs and must not spawn the health monitor.
+func TestProvider_Start_NoGPUs(t *testing.T) {
+	mockLib := NewMockLibrary()
+	mockLib.DeviceCount = 0
+
+	client := testutil.NewTestGPUTypedClient(t)
+
+	provider := &Provider{
+		config:  DefaultConfig(),
+		nvmllib: mockLib,
+		client:  client,
+		logger:  testLogger(),
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	err := provider.Start(ctx)
+	if err != nil {
+		t.Fatalf("Start() failed: %v", err)
+	}
+	defer provider.Stop()
+
+	if provider.GPUCount() != 0 {
+		t.Errorf("Expected 0 GPUs, got %d", provider.GPUCount())
+	}
+
+	// Health monitor should not be running with 0 GPUs
+	if provider.IsHealthMonitorRunning() {
+		t.Error("Health monitor should not run with 0 GPUs")
+	}
+}
+
+// TestProvider_Start_AlreadyStarted tests double-start prevention:
+// a second Start on a running provider must return an error.
+func TestProvider_Start_AlreadyStarted(t *testing.T) {
+	mockLib := NewMockLibrary()
+	client := testutil.NewTestGPUTypedClient(t)
+
+	provider := &Provider{
+		config:  DefaultConfig(),
+		nvmllib: mockLib,
+		client:  client,
+		logger:  testLogger(),
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	// First start
+	err := provider.Start(ctx)
+	if err != nil {
+		t.Fatalf("First Start() failed: %v", err)
+	}
+	defer provider.Stop()
+
+	// Second start should fail
+	err = provider.Start(ctx)
+	if err == nil {
+		t.Error("Second Start() should fail")
+	}
+}
+
+// TestProvider_Stop tests provider shutdown: after Stop the provider is
+// uninitialized, NVML Shutdown was invoked, and a second Stop is a no-op.
+func TestProvider_Stop(t *testing.T) {
+	mockLib := NewMockLibrary()
+	mockLib.AddDevice(0, NewMockDevice("GPU-uuid-0", "NVIDIA A100"))
+
+	client := testutil.NewTestGPUTypedClient(t)
+
+	provider := &Provider{
+		config:  DefaultConfig(),
+		nvmllib: mockLib,
+		client:  client,
+		logger:  testLogger(),
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	err := provider.Start(ctx)
+	if err != nil {
+		t.Fatalf("Start() failed: %v", err)
+	}
+
+	// Stop the provider
+	provider.Stop()
+
+	// Verify state
+	if provider.IsInitialized() {
+		t.Error("Provider should not be initialized after Stop()")
+	}
+
+	if !mockLib.ShutdownCalled {
+		t.Error("NVML Shutdown() was not called")
+	}
+
+	// Double stop should be safe
+	provider.Stop()
+}
+
+// TestProvider_Stop_NotStarted tests Stop() on an unstarted provider:
+// it must be a safe no-op and must not call NVML Shutdown.
+func TestProvider_Stop_NotStarted(t *testing.T) {
+	mockLib := NewMockLibrary()
+	client := testutil.NewTestGPUTypedClient(t)
+
+	provider := &Provider{
+		config:  DefaultConfig(),
+		nvmllib: mockLib,
+		client:  client,
+		logger:  testLogger(),
+	}
+
+	// Stop should be safe even if not started
+	provider.Stop()
+
+	if mockLib.ShutdownCalled {
+		t.Error("Shutdown() should not be called if provider was never started")
+	}
+}
+
+// TestProvider_DeviceEnumeration tests that devices are properly enumerated.
+//
+// Two mock devices are registered with the mock NVML library; after Start()
+// both must be visible through the GPU client, each carrying an initial
+// NVMLReady=True condition.
+func TestProvider_DeviceEnumeration(t *testing.T) {
+	mockLib := NewMockLibrary()
+
+	// Add devices with varying configurations
+	device0 := NewMockDevice("GPU-11111111-1111-1111-1111-111111111111", "NVIDIA H100")
+	device0.MemoryInfo = nvml.Memory{Total: 80 * 1024 * 1024 * 1024} // 80 GB
+
+	device1 := NewMockDevice("GPU-22222222-2222-2222-2222-222222222222", "NVIDIA A100")
+	device1.MemoryInfo = nvml.Memory{Total: 40 * 1024 * 1024 * 1024} // 40 GB
+
+	mockLib.AddDevice(0, device0)
+	mockLib.AddDevice(1, device1)
+
+	client := testutil.NewTestGPUTypedClient(t)
+
+	provider := &Provider{
+		config:  DefaultConfig(),
+		nvmllib: mockLib,
+		client:  client,
+		logger:  testLogger(),
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	err := provider.Start(ctx)
+	if err != nil {
+		t.Fatalf("Start() failed: %v", err)
+	}
+	defer provider.Stop()
+
+	// Verify both devices are registered
+	gpuList, err := client.List(context.Background(), metav1.ListOptions{})
+	if err != nil {
+		t.Fatalf("List failed: %v", err)
+	}
+	gpus := gpuList.Items
+	if len(gpus) != 2 {
+		t.Fatalf("Expected 2 GPUs, got %d", len(gpus))
+	}
+
+	// Verify GPU details
+	// GPU objects are named by device UUID, so collect the names into a set.
+	uuids := make(map[string]bool)
+	for _, gpu := range gpus {
+		uuids[gpu.Name] = true
+
+		// Check initial condition
+		if len(gpu.Status.Conditions) == 0 {
+			t.Errorf("GPU %s has no conditions", gpu.Name)
+			continue
+		}
+
+		cond := gpu.Status.Conditions[0]
+		if cond.Type != ConditionTypeNVMLReady {
+			t.Errorf("Expected condition type %s, got %s", ConditionTypeNVMLReady, cond.Type)
+		}
+
+		if cond.Status != metav1.ConditionStatus(ConditionStatusTrue) {
+			t.Errorf("Expected condition status True, got %s", cond.Status)
+		}
+	}
+
+	if !uuids["GPU-11111111-1111-1111-1111-111111111111"] {
+		t.Error("GPU-11111111... not found in cache")
+	}
+
+	if !uuids["GPU-22222222-2222-2222-2222-222222222222"] {
+		t.Error("GPU-22222222... not found in cache")
+	}
+}
+
+// TestProvider_DeviceEnumeration_PartialFailure tests handling of partial device failures.
+//
+// One of three mock devices fails UUID retrieval; enumeration must skip it
+// and still register the remaining two rather than failing Start() outright.
+func TestProvider_DeviceEnumeration_PartialFailure(t *testing.T) {
+	mockLib := NewMockLibrary()
+
+	// First device is fine
+	mockLib.AddDevice(0, NewMockDevice("GPU-good", "NVIDIA A100"))
+
+	// Second device fails UUID retrieval
+	device1 := NewMockDevice("GPU-bad", "NVIDIA A100")
+	device1.UUIDReturn = nvml.ERROR_UNKNOWN
+	mockLib.AddDevice(1, device1)
+
+	// Third device is fine
+	mockLib.AddDevice(2, NewMockDevice("GPU-good-2", "NVIDIA A100"))
+
+	client := testutil.NewTestGPUTypedClient(t)
+
+	provider := &Provider{
+		config:  DefaultConfig(),
+		nvmllib: mockLib,
+		client:  client,
+		logger:  testLogger(),
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	err := provider.Start(ctx)
+	if err != nil {
+		t.Fatalf("Start() failed: %v", err)
+	}
+	defer provider.Stop()
+
+	// Only 2 GPUs should be registered (one failed)
+	gpuList, err := client.List(context.Background(), metav1.ListOptions{})
+	if err != nil {
+		t.Fatalf("List failed: %v", err)
+	}
+	if len(gpuList.Items) != 2 {
+		t.Errorf("Expected 2 GPUs (1 failed), got %d", len(gpuList.Items))
+	}
+}
+
+// TestProvider_HealthCheckDisabled tests that health monitoring can be disabled.
+func TestProvider_HealthCheckDisabled(t *testing.T) {
+	mockLib := NewMockLibrary()
+	mockLib.AddDevice(0, NewMockDevice("GPU-uuid-0", "NVIDIA A100"))
+
+	client := testutil.NewTestGPUTypedClient(t)
+
+	config := DefaultConfig()
+	config.HealthCheckEnabled = false
+
+	provider := &Provider{
+		config:  config,
+		nvmllib: mockLib,
+		client:  client,
+		logger:  testLogger(),
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	err := provider.Start(ctx)
+	if err != nil {
+		t.Fatalf("Start() failed: %v", err)
+	}
+	defer provider.Stop()
+
+	// Give a moment for any goroutines to start
+	// NOTE(review): a fixed sleep is inherently racy; it only bounds how long
+	// we wait for a monitor that should never start.
+	time.Sleep(10 * time.Millisecond)
+
+	if provider.IsHealthMonitorRunning() {
+		t.Error("Health monitor should not be running when disabled")
+	}
+}
+
+// TestProvider_UpdateCondition tests condition updates.
+//
+// Marks a registered GPU unhealthy via UpdateCondition and verifies the
+// NVMLReady condition's status and reason through the client.
+func TestProvider_UpdateCondition(t *testing.T) {
+	mockLib := NewMockLibrary()
+	mockLib.AddDevice(0, NewMockDevice("GPU-uuid-0", "NVIDIA A100"))
+
+	client := testutil.NewTestGPUTypedClient(t)
+
+	config := DefaultConfig()
+	config.HealthCheckEnabled = false
+
+	provider := &Provider{
+		config:  config,
+		nvmllib: mockLib,
+		client:  client,
+		logger:  testLogger(),
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	err := provider.Start(ctx)
+	if err != nil {
+		t.Fatalf("Start() failed: %v", err)
+	}
+	defer provider.Stop()
+
+	// Update condition to unhealthy
+	err = provider.UpdateCondition("GPU-uuid-0", ConditionTypeNVMLReady, ConditionStatusFalse, "XidError", "Critical XID 48")
+	if err != nil {
+		t.Fatalf("UpdateCondition() failed: %v", err)
+	}
+
+	// Verify condition was updated
+	gpu, err := client.Get(context.Background(), "GPU-uuid-0", metav1.GetOptions{})
+	if err != nil {
+		t.Fatalf("Get failed: %v", err)
+	}
+
+	var foundCondition bool
+
+	for _, cond := range gpu.Status.Conditions {
+		if cond.Type == ConditionTypeNVMLReady {
+			foundCondition = true
+
+			if string(cond.Status) != ConditionStatusFalse {
+				t.Errorf("Expected status False, got %s", cond.Status)
+			}
+
+			if cond.Reason != "XidError" {
+				t.Errorf("Expected reason XidError, got %s", cond.Reason)
+			}
+		}
+	}
+
+	if !foundCondition {
+		t.Error("NVMLReady condition not found")
+	}
+}
+
+// TestProvider_UpdateCondition_GPUNotFound tests condition update for non-existent GPU.
+func TestProvider_UpdateCondition_GPUNotFound(t *testing.T) {
+	mockLib := NewMockLibrary()
+	client := testutil.NewTestGPUTypedClient(t)
+
+	config := DefaultConfig()
+	config.HealthCheckEnabled = false
+
+	provider := &Provider{
+		config:  config,
+		nvmllib: mockLib,
+		client:  client,
+		logger:  testLogger(),
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	err := provider.Start(ctx)
+	if err != nil {
+		t.Fatalf("Start() failed: %v", err)
+	}
+	defer provider.Stop()
+
+	// Try to update condition for non-existent GPU
+	err = provider.UpdateCondition("GPU-nonexistent", ConditionTypeNVMLReady, ConditionStatusFalse, "XidError", "Test")
+	if err == nil {
+		t.Error("Expected error for non-existent GPU")
+	}
+}
+
+// TestProvider_MarkHealthy tests marking a GPU as healthy.
+//
+// Drives the condition to False first, then verifies MarkHealthy flips the
+// NVMLReady condition back to True.
+func TestProvider_MarkHealthy(t *testing.T) {
+	mockLib := NewMockLibrary()
+	mockLib.AddDevice(0, NewMockDevice("GPU-uuid-0", "NVIDIA A100"))
+
+	client := testutil.NewTestGPUTypedClient(t)
+
+	config := DefaultConfig()
+	config.HealthCheckEnabled = false
+
+	provider := &Provider{
+		config:  config,
+		nvmllib: mockLib,
+		client:  client,
+		logger:  testLogger(),
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	err := provider.Start(ctx)
+	if err != nil {
+		t.Fatalf("Start() failed: %v", err)
+	}
+	defer provider.Stop()
+
+	// First mark as unhealthy
+	err = provider.UpdateCondition("GPU-uuid-0", ConditionTypeNVMLReady, ConditionStatusFalse, "XidError", "Test")
+	if err != nil {
+		t.Fatalf("UpdateCondition() failed: %v", err)
+	}
+
+	// Then mark as healthy
+	err = provider.MarkHealthy("GPU-uuid-0")
+	if err != nil {
+		t.Fatalf("MarkHealthy() failed: %v", err)
+	}
+
+	// Verify it's healthy
+	gpu, err := client.Get(context.Background(), "GPU-uuid-0", metav1.GetOptions{})
+	if err != nil {
+		t.Fatalf("Get failed: %v", err)
+	}
+
+	// Early return on the first NVMLReady condition; falling through to the
+	// end means the condition was never found.
+	for _, cond := range gpu.Status.Conditions {
+		if cond.Type == ConditionTypeNVMLReady {
+			if string(cond.Status) != ConditionStatusTrue {
+				t.Errorf("Expected status True after MarkHealthy, got %s", cond.Status)
+			}
+
+			return
+		}
+	}
+
+	t.Error("NVMLReady condition not found")
+}
+
+// contextCapturingClient wraps a GPUInterface and captures the context
+// passed to Create. This allows tests to verify that enumerateDevices
+// receives a non-nil context.
+//
+// NOTE(review): capturedCtx is written without synchronization; this is fine
+// here only because the test reads it after Start() has returned — confirm
+// Create is not called concurrently if this helper is reused.
+type contextCapturingClient struct {
+	inner       gpuclient.GPUInterface
+	capturedCtx context.Context
+}
+
+// newContextCapturingClient wraps inner so Create calls record their context.
+func newContextCapturingClient(inner gpuclient.GPUInterface) *contextCapturingClient {
+	return &contextCapturingClient{inner: inner}
+}
+
+// Create records ctx and delegates to the wrapped client.
+func (c *contextCapturingClient) Create(ctx context.Context, gpu *devicev1alpha1.GPU, opts metav1.CreateOptions) (*devicev1alpha1.GPU, error) {
+	c.capturedCtx = ctx
+	return c.inner.Create(ctx, gpu, opts)
+}
+
+// The remaining methods delegate unchanged to satisfy gpuclient.GPUInterface.
+
+func (c *contextCapturingClient) Get(ctx context.Context, name string, opts metav1.GetOptions) (*devicev1alpha1.GPU, error) {
+	return c.inner.Get(ctx, name, opts)
+}
+
+func (c *contextCapturingClient) Update(ctx context.Context, gpu *devicev1alpha1.GPU, opts metav1.UpdateOptions) (*devicev1alpha1.GPU, error) {
+	return c.inner.Update(ctx, gpu, opts)
+}
+
+func (c *contextCapturingClient) UpdateStatus(ctx context.Context, gpu *devicev1alpha1.GPU, opts metav1.UpdateOptions) (*devicev1alpha1.GPU, error) {
+	return c.inner.UpdateStatus(ctx, gpu, opts)
+}
+
+func (c *contextCapturingClient) List(ctx context.Context, opts metav1.ListOptions) (*devicev1alpha1.GPUList, error) {
+	return c.inner.List(ctx, opts)
+}
+
+func (c *contextCapturingClient) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
+	return c.inner.Delete(ctx, name, opts)
+}
+
+func (c *contextCapturingClient) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) {
+	return c.inner.Watch(ctx, opts)
+}
+
+// TestProvider_Start_ContextSetBeforeEnumerate verifies that enumerateDevices
+// receives a non-nil context. Before the fix, p.ctx was nil when
+// enumerateDevices was called, which would cause a gRPC panic on any real
+// gRPC client.
+//
+// This is a regression test: it relies on the capturing client recording the
+// context from the Create call issued during device enumeration.
+func TestProvider_Start_ContextSetBeforeEnumerate(t *testing.T) {
+	mockLib := NewMockLibrary()
+	mockLib.AddDevice(0, NewMockDevice("GPU-ctx-test", "NVIDIA A100"))
+
+	typedClient := testutil.NewTestGPUTypedClient(t)
+	capturingClient := newContextCapturingClient(typedClient)
+
+	provider := &Provider{
+		config:  Config{HealthCheckEnabled: false},
+		nvmllib: mockLib,
+		client:  capturingClient,
+		logger:  testLogger(),
+	}
+
+	ctx := context.Background()
+	err := provider.Start(ctx)
+	if err != nil {
+		t.Fatalf("Start() failed: %v", err)
+	}
+	defer provider.Stop()
+
+	// The capturing client recorded the context passed to Create during
+	// enumerateDevices. If the fix is missing, this will be nil because p.ctx
+	// was not set before enumerateDevices was called.
+	if capturingClient.capturedCtx == nil {
+		t.Fatal("Create was called with nil context; p.ctx must be set before enumerateDevices()")
+	}
+
+	// Also verify p.ctx is set after Start returns.
+	if provider.ctx == nil {
+		t.Fatal("p.ctx should be set after Start()")
+	}
+}
diff --git a/pkg/providers/nvml/shared.go b/pkg/providers/nvml/shared.go
new file mode 100644
index 000000000..d33c58619
--- /dev/null
+++ b/pkg/providers/nvml/shared.go
@@ -0,0 +1,85 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nvml
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+)
+
+// Condition constants for NVML provider.
+//
+// The status strings mirror metav1.ConditionStatus values so they can be
+// converted directly when building GPU conditions.
+const (
+	// ConditionTypeNVMLReady is the condition type for NVML health status.
+	ConditionTypeNVMLReady = "NVMLReady"
+
+	// ConditionSourceNVML is the source identifier for conditions set by NVML provider.
+	ConditionSourceNVML = "nvml-provider"
+
+	// ConditionStatusTrue indicates the condition is met.
+	ConditionStatusTrue = "True"
+
+	// ConditionStatusFalse indicates the condition is not met.
+	ConditionStatusFalse = "False"
+
+	// ConditionStatusUnknown indicates the condition status is unknown.
+	ConditionStatusUnknown = "Unknown"
+)
+
+// FormatBytes formats bytes to a human-readable string.
+//
+// Uses binary units (1 KB = 1024 B) with one decimal place. GB is the
+// largest unit: values of 1 TB and above still render in GB (e.g. "1536.0 GB").
+func FormatBytes(bytes uint64) string {
+	const (
+		KB = 1024
+		MB = KB * 1024
+		GB = MB * 1024
+	)
+
+	switch {
+	case bytes >= GB:
+		return fmt.Sprintf("%.1f GB", float64(bytes)/float64(GB))
+	case bytes >= MB:
+		return fmt.Sprintf("%.1f MB", float64(bytes)/float64(MB))
+	case bytes >= KB:
+		return fmt.Sprintf("%.1f KB", float64(bytes)/float64(KB))
+	default:
+		return fmt.Sprintf("%d B", bytes)
+	}
+}
+
+// FindDriverLibrary locates the NVML library in the driver root.
+//
+// It searches common paths where libnvidia-ml.so.1 might be located.
+// Returns empty string if not found (will use system default).
+//
+// NOTE(review): the Debian-style path is x86_64-specific
+// (usr/lib/x86_64-linux-gnu); arm64 driver roots would not be found —
+// confirm whether aarch64 support is needed.
+func FindDriverLibrary(driverRoot string) string {
+	if driverRoot == "" {
+		return ""
+	}
+
+	// Paths are probed in order; the first one that stats successfully wins.
+	searchPaths := []string{
+		filepath.Join(driverRoot, "usr/lib64/libnvidia-ml.so.1"),
+		filepath.Join(driverRoot, "usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1"),
+		filepath.Join(driverRoot, "usr/lib/libnvidia-ml.so.1"),
+		filepath.Join(driverRoot, "lib64/libnvidia-ml.so.1"),
+		filepath.Join(driverRoot, "lib/libnvidia-ml.so.1"),
+	}
+
+	for _, path := range searchPaths {
+		if _, err := os.Stat(path); err == nil {
+			return path
+		}
+	}
+
+	return ""
+}
diff --git a/pkg/providers/nvml/stub.go b/pkg/providers/nvml/stub.go
new file mode 100644
index 000000000..c2b7baf7e
--- /dev/null
+++ b/pkg/providers/nvml/stub.go
@@ -0,0 +1,80 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !nvml
+
+// Package nvml provides a built-in NVML-based health provider for the Device API Server.
+//
+// This stub file is used when NVML support is not compiled in (build without -tags=nvml).
+package nvml
+
+import (
+ "context"
+ "errors"
+
+ "k8s.io/klog/v2"
+
+ gpuclient "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned/typed/device/v1alpha1"
+)
+
+// ErrNVMLNotCompiled is returned when NVML support is not compiled into the binary.
+var ErrNVMLNotCompiled = errors.New("NVML support not compiled in (build with -tags=nvml)")
+
+// Provider is the built-in NVML-based health provider (stub when not compiled).
+// It carries no state; every method is a no-op or returns a zero value.
+type Provider struct{}
+
+// Config holds configuration for the NVML provider.
+// This mirrors the real (nvml-tagged) Config so callers compile either way.
+type Config struct {
+	DriverRoot            string
+	AdditionalIgnoredXids []uint64
+	HealthCheckEnabled    bool
+}
+
+// DefaultConfig returns a Config with sensible defaults.
+func DefaultConfig() Config {
+	return Config{
+		DriverRoot:            "/run/nvidia/driver",
+		AdditionalIgnoredXids: nil,
+		HealthCheckEnabled:    true,
+	}
+}
+
+// New creates a new NVML provider (stub).
+// All arguments are intentionally ignored; the stub keeps the same signature
+// as the real constructor so call sites need no build-tag awareness.
+func New(cfg Config, client gpuclient.GPUInterface, logger klog.Logger) *Provider {
+	return &Provider{}
+}
+
+// Start initializes NVML (stub - always returns error).
+func (p *Provider) Start(ctx context.Context) error {
+	return ErrNVMLNotCompiled
+}
+
+// Stop shuts down the NVML provider (stub - no-op).
+func (p *Provider) Stop() {}
+
+// IsInitialized returns false (stub).
+func (p *Provider) IsInitialized() bool {
+	return false
+}
+
+// GPUCount returns 0 (stub).
+func (p *Provider) GPUCount() int {
+	return 0
+}
+
+// IsHealthMonitorRunning returns false (stub).
+func (p *Provider) IsHealthMonitorRunning() bool {
+	return false
+}
+
diff --git a/pkg/providers/nvml/xid.go b/pkg/providers/nvml/xid.go
new file mode 100644
index 000000000..718bb3814
--- /dev/null
+++ b/pkg/providers/nvml/xid.go
@@ -0,0 +1,213 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nvml
+
+import (
+ "strconv"
+ "strings"
+)
+
+// XID errors documentation:
+// https://docs.nvidia.com/deploy/xid-errors/index.html
+
+// defaultIgnoredXids contains XID error codes that are typically caused by
+// application errors rather than hardware failures. These are ignored by
+// default to avoid false positives in health monitoring.
+//
+// Reference: https://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
+var defaultIgnoredXids = map[uint64]bool{
+	// Application errors - GPU should still be healthy
+	13:  true, // Graphics Engine Exception
+	31:  true, // GPU memory page fault
+	43:  true, // GPU stopped processing
+	45:  true, // Preemptive cleanup, due to previous errors
+	68:  true, // Video processor exception
+	109: true, // Context Switch Timeout Error
+}
+
+// criticalXids contains XID error codes that indicate critical hardware
+// failures requiring immediate attention.
+//
+// A given XID must never appear in both this set and defaultIgnoredXids;
+// the two sets are disjoint by construction.
+var criticalXids = map[uint64]bool{
+	// Memory errors
+	48: true, // Double Bit ECC Error
+	63: true, // Row remapping failure
+	64: true, // Uncontained ECC error
+	74: true, // NVLink error
+	79: true, // GPU has fallen off the bus
+
+	// Fatal errors
+	94:  true, // Contained ECC error (severe)
+	95:  true, // Uncontained ECC error
+	119: true, // GSP (GPU System Processor) error
+	120: true, // GSP firmware error
+}
+
+// XidDescriptions provides human-readable descriptions for common XIDs.
+// Covers every entry of the two sets above plus a few other notable XIDs;
+// lookups for anything else should fall back to a generic description.
+var XidDescriptions = map[uint64]string{
+	// Application errors (typically ignored)
+	13:  "Graphics Engine Exception",
+	31:  "GPU memory page fault",
+	43:  "GPU stopped processing",
+	45:  "Preemptive cleanup",
+	68:  "Video processor exception",
+	109: "Context Switch Timeout",
+
+	// Memory errors
+	48: "Double Bit ECC Error",
+	63: "Row remapping failure",
+	64: "Uncontained ECC error",
+	74: "NVLink error",
+	79: "GPU has fallen off the bus",
+	94: "Contained ECC error",
+	95: "Uncontained ECC error",
+
+	// Other notable XIDs
+	8:   "GPU not accessible",
+	32:  "Invalid or corrupted push buffer stream",
+	38:  "Driver firmware error",
+	56:  "Display engine error",
+	57:  "Error programming video memory interface",
+	62:  "Internal micro-controller halt (non-fatal)",
+	69:  "Graphics engine accessor error",
+	119: "GSP error",
+	120: "GSP firmware error",
+}
+
+// IsDefaultIgnored returns true if the XID is in the default ignored set.
+func IsDefaultIgnored(xid uint64) bool {
+	return defaultIgnoredXids[xid]
+}
+
+// IsCritical returns true if the XID is in the critical set.
+//
+// NOTE(review): this is byte-for-byte identical to IsCriticalXid below.
+// Keep both for now (removing either would break callers), but consider
+// marking one "Deprecated:" so the API converges on a single name.
+func IsCritical(xid uint64) bool {
+	return criticalXids[xid]
+}
+
+// DefaultIgnoredXidsList returns a copy of the default ignored XID set.
+// A copy is returned so callers cannot mutate the package-level map.
+func DefaultIgnoredXidsList() map[uint64]bool {
+	out := make(map[uint64]bool, len(defaultIgnoredXids))
+	for k, v := range defaultIgnoredXids {
+		out[k] = v
+	}
+	return out
+}
+
+// isIgnoredXid returns true if the XID should be ignored for health purposes.
+//
+// An XID is ignored if it's in the default ignored list OR in the additional
+// ignored map provided by the user. The map is built once at provider startup
+// from the config slice for O(1) lookup.
+func isIgnoredXid(xid uint64, additionalIgnored map[uint64]bool) bool {
+	if defaultIgnoredXids[xid] {
+		return true
+	}
+
+	// A nil map is safe here: indexing a nil map yields the zero value (false).
+	return additionalIgnored[xid]
+}
+
+// IsCriticalXid returns true if the XID indicates a critical hardware failure.
+//
+// NOTE(review): duplicate of IsCritical above — see the note there.
+func IsCriticalXid(xid uint64) bool {
+	return criticalXids[xid]
+}
+
+// xidToString returns a human-readable description for an XID.
+// Unrecognized XIDs map to the generic "Unknown XID".
+func xidToString(xid uint64) string {
+	if desc, ok := XidDescriptions[xid]; ok {
+		return desc
+	}
+
+	return "Unknown XID"
+}
+
+// ParseIgnoredXids parses a comma-or-space-separated string of XID values.
+// Non-numeric tokens are silently skipped.
+//
+// Returns nil (not an empty slice) for empty input or when no token parses,
+// so callers can treat "nothing configured" and "nothing valid" uniformly.
+func ParseIgnoredXids(input string) []uint64 {
+	if input == "" {
+		return nil
+	}
+
+	var result []uint64
+
+	// FieldsFunc drops empty fields, so leading/trailing/duplicate
+	// separators are tolerated.
+	tokens := strings.FieldsFunc(input, func(r rune) bool {
+		return r == ',' || r == ' '
+	})
+
+	for _, tok := range tokens {
+		v, err := strconv.ParseUint(tok, 10, 64)
+		if err != nil {
+			continue
+		}
+
+		result = append(result, v)
+	}
+
+	if len(result) == 0 {
+		return nil
+	}
+
+	return result
+}
+
+// XidSeverity represents the severity level of an XID error.
+type XidSeverity int
+
+const (
+	// XidSeverityUnknown indicates the XID severity is unknown.
+	// Note: GetXidSeverity never returns this value; it exists as the
+	// zero value and for String() round-tripping.
+	XidSeverityUnknown XidSeverity = iota
+	// XidSeverityIgnored indicates the XID is typically caused by applications.
+	XidSeverityIgnored
+	// XidSeverityWarning indicates the XID may indicate a problem.
+	XidSeverityWarning
+	// XidSeverityCritical indicates the XID indicates a critical hardware failure.
+	XidSeverityCritical
+)
+
+// Severity string constants.
+const (
+	severityUnknown  = "unknown"
+	severityIgnored  = "ignored"
+	severityWarning  = "warning"
+	severityCritical = "critical"
+)
+
+// GetXidSeverity returns the severity level for an XID.
+//
+// Classification: default-ignored → Ignored, critical → Critical,
+// everything else → Warning. The two classification sets are disjoint,
+// so the check order does not affect the result.
+func GetXidSeverity(xid uint64) XidSeverity {
+	if defaultIgnoredXids[xid] {
+		return XidSeverityIgnored
+	}
+
+	if criticalXids[xid] {
+		return XidSeverityCritical
+	}
+
+	// XIDs not in either list are treated as warnings
+	return XidSeverityWarning
+}
+
+// String returns a string representation of XidSeverity.
+// Out-of-range values fall back to "unknown".
+func (s XidSeverity) String() string {
+	switch s {
+	case XidSeverityUnknown:
+		return severityUnknown
+	case XidSeverityIgnored:
+		return severityIgnored
+	case XidSeverityWarning:
+		return severityWarning
+	case XidSeverityCritical:
+		return severityCritical
+	default:
+		return severityUnknown
+	}
+}
diff --git a/pkg/providers/nvml/xid_test.go b/pkg/providers/nvml/xid_test.go
new file mode 100644
index 000000000..f6d9eadaf
--- /dev/null
+++ b/pkg/providers/nvml/xid_test.go
@@ -0,0 +1,279 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nvml
+
+import (
+ "math"
+ "testing"
+)
+
+// TestIsIgnoredXid_DefaultIgnored verifies every default-ignored XID is
+// ignored when no additional list is supplied.
+func TestIsIgnoredXid_DefaultIgnored(t *testing.T) {
+	// Test default ignored XIDs
+	defaultIgnored := []uint64{13, 31, 43, 45, 68, 109}
+
+	for _, xid := range defaultIgnored {
+		if !isIgnoredXid(xid, nil) {
+			t.Errorf("XID %d should be ignored by default", xid)
+		}
+	}
+}
+
+// TestIsIgnoredXid_CriticalNotIgnored verifies critical XIDs are never
+// ignored by default.
+func TestIsIgnoredXid_CriticalNotIgnored(t *testing.T) {
+	// Test critical XIDs are not ignored by default
+	criticalXids := []uint64{48, 63, 64, 74, 79, 94, 95, 119, 120}
+
+	for _, xid := range criticalXids {
+		if isIgnoredXid(xid, nil) {
+			t.Errorf("Critical XID %d should not be ignored by default", xid)
+		}
+	}
+}
+
+// TestIsIgnoredXid_AdditionalIgnored verifies the user-supplied additional
+// map can override critical XIDs into the ignored set.
+func TestIsIgnoredXid_AdditionalIgnored(t *testing.T) {
+	// Test additional ignored XIDs
+	additionalIgnored := map[uint64]bool{48: true, 63: true} // Make critical XIDs ignored
+
+	// Normally critical, but now ignored
+	if !isIgnoredXid(48, additionalIgnored) {
+		t.Error("XID 48 should be ignored when in additional list")
+	}
+
+	if !isIgnoredXid(63, additionalIgnored) {
+		t.Error("XID 63 should be ignored when in additional list")
+	}
+
+	// Still critical (not in additional list)
+	if isIgnoredXid(64, additionalIgnored) {
+		t.Error("XID 64 should not be ignored (not in additional list)")
+	}
+}
+
+// TestIsIgnoredXid_UnknownXid verifies unlisted XIDs are not ignored.
+func TestIsIgnoredXid_UnknownXid(t *testing.T) {
+	// Unknown XIDs should not be ignored
+	unknownXids := []uint64{1, 2, 3, 999, 12345}
+
+	for _, xid := range unknownXids {
+		if isIgnoredXid(xid, nil) {
+			t.Errorf("Unknown XID %d should not be ignored", xid)
+		}
+	}
+}
+
+// TestIsIgnoredXid_BoundaryValues probes 0 and MaxUint64.
+func TestIsIgnoredXid_BoundaryValues(t *testing.T) {
+	// Boundary values should not be ignored
+	if isIgnoredXid(0, nil) {
+		t.Error("XID 0 should not be ignored")
+	}
+
+	if isIgnoredXid(math.MaxUint64, nil) {
+		t.Error("XID MaxUint64 should not be ignored")
+	}
+}
+
+// TestIsCriticalXid is a table-driven check of the critical set membership.
+func TestIsCriticalXid(t *testing.T) {
+	tests := []struct {
+		xid      uint64
+		expected bool
+	}{
+		// Critical XIDs
+		{48, true},
+		{63, true},
+		{64, true},
+		{74, true},
+		{79, true},
+		{94, true},
+		{95, true},
+		{119, true},
+		{120, true},
+
+		// Non-critical XIDs
+		{13, false},
+		{31, false},
+		{43, false},
+		{1, false},
+		{999, false},
+
+		// Boundary values
+		{0, false},
+		{math.MaxUint64, false},
+	}
+
+	for _, tt := range tests {
+		result := IsCriticalXid(tt.xid)
+		if result != tt.expected {
+			t.Errorf("IsCriticalXid(%d) = %v, want %v", tt.xid, result, tt.expected)
+		}
+	}
+}
+
+// TestXidToString checks known descriptions and the "Unknown XID" fallback.
+func TestXidToString(t *testing.T) {
+	tests := []struct {
+		xid      uint64
+		expected string
+	}{
+		{13, "Graphics Engine Exception"},
+		{31, "GPU memory page fault"},
+		{48, "Double Bit ECC Error"},
+		{79, "GPU has fallen off the bus"},
+		{109, "Context Switch Timeout"},
+		{999, "Unknown XID"},
+		{0, "Unknown XID"},
+	}
+
+	for _, tt := range tests {
+		result := xidToString(tt.xid)
+		if result != tt.expected {
+			t.Errorf("xidToString(%d) = %q, want %q", tt.xid, result, tt.expected)
+		}
+	}
+}
+
+// TestParseIgnoredXids covers separators (comma, space, mixed), empty and
+// stray-separator inputs, and the skip-invalid-token behavior. Note the
+// nil-vs-empty distinction: empty/unparseable input yields nil.
+func TestParseIgnoredXids(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected []uint64
+	}{
+		{
+			name:     "empty string",
+			input:    "",
+			expected: nil,
+		},
+		{
+			name:     "single value",
+			input:    "48",
+			expected: []uint64{48},
+		},
+		{
+			name:     "multiple comma separated",
+			input:    "48,63,64",
+			expected: []uint64{48, 63, 64},
+		},
+		{
+			name:     "with spaces",
+			input:    "48, 63, 64",
+			expected: []uint64{48, 63, 64},
+		},
+		{
+			name:     "space separated",
+			input:    "48 63 64",
+			expected: []uint64{48, 63, 64},
+		},
+		{
+			name:     "mixed separators",
+			input:    "48, 63 64,65",
+			expected: []uint64{48, 63, 64, 65},
+		},
+		{
+			name:     "trailing comma",
+			input:    "48,63,",
+			expected: []uint64{48, 63},
+		},
+		{
+			name:     "leading comma",
+			input:    ",48,63",
+			expected: []uint64{48, 63},
+		},
+		{
+			// "4a8" fails ParseUint and is dropped; "63" still parses.
+			name:     "non-numeric characters mixed in",
+			input:    "4a8,63",
+			expected: []uint64{63},
+		},
+		{
+			name:     "completely non-numeric",
+			input:    "abc",
+			expected: nil,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := ParseIgnoredXids(tt.input)
+
+			if len(result) != len(tt.expected) {
+				t.Errorf("ParseIgnoredXids(%q) len = %d, want %d", tt.input, len(result), len(tt.expected))
+				return
+			}
+
+			// Element-wise comparison; order must be preserved from the input.
+			for i, v := range result {
+				if v != tt.expected[i] {
+					t.Errorf("ParseIgnoredXids(%q)[%d] = %d, want %d", tt.input, i, v, tt.expected[i])
+				}
+			}
+		})
+	}
+}
+
+// TestGetXidSeverity checks the three-way classification; note that
+// XidSeverityUnknown is intentionally absent from expectations because
+// GetXidSeverity never returns it.
+func TestGetXidSeverity(t *testing.T) {
+	tests := []struct {
+		xid      uint64
+		expected XidSeverity
+	}{
+		// Ignored (application errors)
+		{13, XidSeverityIgnored},
+		{31, XidSeverityIgnored},
+		{43, XidSeverityIgnored},
+		{45, XidSeverityIgnored},
+		{68, XidSeverityIgnored},
+		{109, XidSeverityIgnored},
+
+		// Critical (hardware failures)
+		{48, XidSeverityCritical},
+		{63, XidSeverityCritical},
+		{64, XidSeverityCritical},
+		{74, XidSeverityCritical},
+		{79, XidSeverityCritical},
+		{94, XidSeverityCritical},
+		{95, XidSeverityCritical},
+		{119, XidSeverityCritical},
+		{120, XidSeverityCritical},
+
+		// Warning (unknown XIDs)
+		{1, XidSeverityWarning},
+		{2, XidSeverityWarning},
+		{999, XidSeverityWarning},
+
+		// Boundary values
+		{0, XidSeverityWarning},
+		{math.MaxUint64, XidSeverityWarning},
+	}
+
+	for _, tt := range tests {
+		result := GetXidSeverity(tt.xid)
+		if result != tt.expected {
+			t.Errorf("GetXidSeverity(%d) = %v, want %v", tt.xid, result, tt.expected)
+		}
+	}
+}
+
+// TestXidSeverity_String covers all defined severities plus the
+// out-of-range fallback to "unknown".
+func TestXidSeverity_String(t *testing.T) {
+	tests := []struct {
+		severity XidSeverity
+		expected string
+	}{
+		{XidSeverityUnknown, "unknown"},
+		{XidSeverityIgnored, "ignored"},
+		{XidSeverityWarning, "warning"},
+		{XidSeverityCritical, "critical"},
+		{XidSeverity(99), "unknown"}, // Invalid severity
+	}
+
+	for _, tt := range tests {
+		result := tt.severity.String()
+		if result != tt.expected {
+			t.Errorf("XidSeverity(%d).String() = %q, want %q", tt.severity, result, tt.expected)
+		}
+	}
+}
diff --git a/pkg/services/device/v1alpha1/gpu_provider.go b/pkg/services/device/v1alpha1/gpu_provider.go
index 32dc779bd..7f11c98e7 100644
--- a/pkg/services/device/v1alpha1/gpu_provider.go
+++ b/pkg/services/device/v1alpha1/gpu_provider.go
@@ -1,81 +1,76 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
//
-// http://www.apache.org/licenses/LICENSE-2.0
+// http://www.apache.org/licenses/LICENSE-2.0
//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by service-gen. DO NOT EDIT.
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
package v1alpha1
import (
"fmt"
- "path"
devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1"
pb "github.com/nvidia/nvsentinel/internal/generated/device/v1alpha1"
"github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/api"
"github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/registry"
+ "github.com/nvidia/nvsentinel/pkg/storage/memory"
"google.golang.org/grpc"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/runtime/serializer"
"k8s.io/apiserver/pkg/storage/storagebackend"
- "k8s.io/apiserver/pkg/storage/storagebackend/factory"
)
func init() {
registry.Register(NewGPUServiceProvider())
}
-type gpuServiceProvider struct{
+type gpuServiceProvider struct {
groupVersion schema.GroupVersion
}
+// NewGPUServiceProvider returns a ServiceProvider that installs the GPU gRPC service.
func NewGPUServiceProvider() api.ServiceProvider {
return &gpuServiceProvider{
groupVersion: devicev1alpha1.SchemeGroupVersion,
}
}
-func (p *gpuServiceProvider) Install(svr *grpc.Server, storageConfig storagebackend.Config) (api.Service, error) {
+// Install creates the in-memory storage backend and registers the GPU service
+// on the provided gRPC server.
+func (p *gpuServiceProvider) Install(svr *grpc.Server, cfg storagebackend.Config) (api.Service, error) {
+ // Currently only in-memory storage is supported. The cfg parameter is
+ // accepted for future extensibility but not used for backend selection.
+ _ = cfg
+
gv := p.groupVersion.String()
scheme := runtime.NewScheme()
if err := devicev1alpha1.AddToScheme(scheme); err != nil {
return nil, fmt.Errorf("failed to add %q to scheme: %w", gv, err)
}
-
- codecs := serializer.NewCodecFactory(scheme)
- codec := codecs.LegacyCodec(p.groupVersion)
- configForResource := storagebackend.ConfigForResource{
- Config: storageConfig,
+ codecs := serializer.NewCodecFactory(scheme)
+ info, ok := runtime.SerializerInfoForMediaType(codecs.SupportedMediaTypes(), runtime.ContentTypeJSON)
+ if !ok {
+ return nil, fmt.Errorf("no serializer found for %s in %s", runtime.ContentTypeJSON, gv)
}
- configForResource.Config.Codec = codec
-
- resourcePrefix := path.Join("/", p.groupVersion.Group, "gpus")
+ codec := codecs.CodecForVersions(info.Serializer, info.Serializer, schema.GroupVersions{p.groupVersion}, schema.GroupVersions{p.groupVersion})
- s, destroyFunc, err := factory.Create(
- configForResource,
- func() runtime.Object { return &devicev1alpha1.GPU{} },
- func() runtime.Object { return &devicev1alpha1.GPUList{} },
- resourcePrefix,
- )
+ s, destroyFunc, err := memory.CreateStorage(codec)
if err != nil {
- return nil, fmt.Errorf("failed to create storage for %s: %w", resourcePrefix, err)
+ return nil, fmt.Errorf("failed to create in-memory storage for %s: %w", gv, err)
}
service := NewGPUService(s, destroyFunc)
-
pb.RegisterGpuServiceServer(svr, service)
return service, nil
diff --git a/pkg/services/device/v1alpha1/gpu_service.go b/pkg/services/device/v1alpha1/gpu_service.go
index 3bff930d5..f4434ef6d 100644
--- a/pkg/services/device/v1alpha1/gpu_service.go
+++ b/pkg/services/device/v1alpha1/gpu_service.go
@@ -1,18 +1,16 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
//
-// http://www.apache.org/licenses/LICENSE-2.0
+// http://www.apache.org/licenses/LICENSE-2.0
//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by service-gen. DO NOT EDIT.
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
package v1alpha1
@@ -21,9 +19,11 @@ import (
"fmt"
"path"
"reflect"
+ "regexp"
devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1"
pb "github.com/nvidia/nvsentinel/internal/generated/device/v1alpha1"
+ "google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"google.golang.org/protobuf/types/known/emptypb"
@@ -38,21 +38,24 @@ import (
type gpuService struct {
pb.UnimplementedGpuServiceServer
- storage storage.Interface
+ storage storage.Interface
destroyFunc factory.DestroyFunc
}
+// NewGPUService creates a new GPU gRPC service backed by the provided storage.
func NewGPUService(storage storage.Interface, destroyFunc factory.DestroyFunc) *gpuService {
return &gpuService{
- storage: storage,
+ storage: storage,
destroyFunc: destroyFunc,
}
}
+// Name returns the fully qualified gRPC service name.
func (s *gpuService) Name() string {
return pb.GpuService_ServiceDesc.ServiceName
}
+// IsReady reports whether the underlying storage backend is healthy.
func (s *gpuService) IsReady() bool {
if s.storage == nil {
return false
@@ -60,6 +63,7 @@ func (s *gpuService) IsReady() bool {
return s.storage.ReadinessCheck() == nil
}
+// Cleanup shuts down the storage backend.
func (s *gpuService) Cleanup() {
if s.destroyFunc != nil {
klog.V(2).InfoS("Shutting down storage backend", "service", s.Name())
@@ -67,22 +71,67 @@ func (s *gpuService) Cleanup() {
}
}
-func (s *gpuService) storageKey(ns string, name string) string {
- base := path.Join("/", devicev1alpha1.SchemeGroupVersion.Group, "gpus")
+// normalizeNamespace returns "default" if ns is empty.
+func normalizeNamespace(ns string) string {
+ if ns == "" {
+ return "default"
+ }
+ return ns
+}
+
+// validateNamespace checks that ns does not exceed the K8s maximum namespace length.
+// An empty namespace is valid (it defaults to "default" elsewhere).
+func validateNamespace(ns string) error {
+ if ns == "" {
+ return nil
+ }
+ if len(ns) > 63 { // K8s namespace max length (RFC 1123 DNS label)
+ return status.Error(codes.InvalidArgument, "namespace exceeds maximum length of 63 characters")
+ }
+ return nil
+}
+
+// gpuUUIDPattern matches NVIDIA GPU UUIDs
+// (e.g., GPU-12345678-1234-1234-1234-123456789abc).
+var gpuUUIDPattern = regexp.MustCompile(
+ `^GPU-[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-` +
+ `[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$`,
+)
+
+// validateGPUName checks that name is non-empty and matches
+// the NVIDIA GPU UUID format.
+func validateGPUName(name string) error {
+ if name == "" {
+ return status.Error(codes.InvalidArgument, "name is required")
+ }
- if ns == "" && name != "" {
- ns = "default"
+ if !gpuUUIDPattern.MatchString(name) {
+ return status.Errorf(codes.InvalidArgument,
+ "name must be a valid GPU UUID "+
+ "(GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx), got %q",
+ name)
}
- // Pattern: /device.nvidia.com/gpus//
+ return nil
+}
+
+func (s *gpuService) storageKey(ns string, name string) string {
+ base := path.Join("/", devicev1alpha1.SchemeGroupVersion.Group, "gpus")
+ if name != "" {
+ ns = normalizeNamespace(ns)
+ }
return path.Join(base, ns, name)
}
+// GetGpu retrieves a single GPU resource.
func (s *gpuService) GetGpu(ctx context.Context, req *pb.GetGpuRequest) (*pb.GetGpuResponse, error) {
logger := klog.FromContext(ctx)
- if req.GetName() == "" {
- return nil, status.Error(codes.InvalidArgument, "name is required")
+ if err := validateGPUName(req.GetName()); err != nil {
+ return nil, err
+ }
+ if err := validateNamespace(req.GetNamespace()); err != nil {
+ return nil, err
}
key := s.storageKey(req.GetNamespace(), req.GetName())
@@ -106,15 +155,20 @@ func (s *gpuService) GetGpu(ctx context.Context, req *pb.GetGpuRequest) (*pb.Get
}, nil
}
+// ListGpus retrieves a list of GPU resources.
func (s *gpuService) ListGpus(ctx context.Context, req *pb.ListGpusRequest) (*pb.ListGpusResponse, error) {
logger := klog.FromContext(ctx)
+ if err := validateNamespace(req.GetNamespace()); err != nil {
+ return nil, err
+ }
+
var gpus devicev1alpha1.GPUList
opts := storage.ListOptions{
ResourceVersion: req.GetOpts().GetResourceVersion(),
Recursive: true,
- Predicate: storage.Everything, // TODO: selection predicate
+ Predicate: storage.Everything,
}
key := s.storageKey(req.GetNamespace(), "")
@@ -125,7 +179,6 @@ func (s *gpuService) ListGpus(ctx context.Context, req *pb.ListGpusRequest) (*pb
if rv == 0 {
rvStr = req.GetOpts().GetResourceVersion()
}
-
return &pb.ListGpusResponse{
GpuList: &pb.GpuList{
Metadata: &pb.ListMeta{
@@ -150,7 +203,8 @@ func (s *gpuService) ListGpus(ctx context.Context, req *pb.ListGpusRequest) (*pb
}, nil
}
-func (s *gpuService) WatchGpus(req *pb.WatchGpusRequest, stream pb.GpuService_WatchGpusServer) error {
+// WatchGpus streams lifecycle events for GPU resources.
+func (s *gpuService) WatchGpus(req *pb.WatchGpusRequest, stream grpc.ServerStreamingServer[pb.WatchGpusResponse]) error {
ctx := stream.Context()
logger := klog.FromContext(ctx)
@@ -159,9 +213,9 @@ func (s *gpuService) WatchGpus(req *pb.WatchGpusRequest, stream pb.GpuService_Wa
key := s.storageKey(req.GetNamespace(), "")
w, err := s.storage.Watch(ctx, key, storage.ListOptions{
- ResourceVersion: req.GetOpts().GetResourceVersion(),
+ ResourceVersion: rv,
Recursive: true,
- Predicate: storage.Everything, // TODO: selection predicate
+ Predicate: storage.Everything,
})
if err != nil {
if storage.IsInvalidError(err) {
@@ -226,25 +280,25 @@ func (s *gpuService) WatchGpus(req *pb.WatchGpusRequest, stream pb.GpuService_Wa
}
}
+// CreateGpu creates a single GPU resource.
func (s *gpuService) CreateGpu(ctx context.Context, req *pb.CreateGpuRequest) (*pb.Gpu, error) {
logger := klog.FromContext(ctx)
if req.GetGpu() == nil {
return nil, status.Error(codes.InvalidArgument, "resource body is required")
}
- if req.GetGpu().GetMetadata() == nil || req.GetGpu().GetMetadata().GetName() == "" {
+ if req.GetGpu().GetMetadata() == nil {
return nil, status.Error(codes.InvalidArgument, "metadata.name: Required value")
}
+ if err := validateGPUName(req.GetGpu().GetMetadata().GetName()); err != nil {
+ return nil, err
+ }
name := req.GetGpu().GetMetadata().GetName()
- ns := req.GetGpu().GetMetadata().GetNamespace()
- if ns == "" {
- ns = "default"
- }
+ ns := normalizeNamespace(req.GetGpu().GetMetadata().GetNamespace())
key := s.storageKey(ns, name)
gpu := devicev1alpha1.FromProto(req.Gpu)
- // TODO: move into PrepareForCreate function?
gpu.SetNamespace(ns)
gpu.SetUID(uuid.NewUUID())
now := metav1.Now()
@@ -270,15 +324,19 @@ func (s *gpuService) CreateGpu(ctx context.Context, req *pb.CreateGpuRequest) (*
return devicev1alpha1.ToProto(out), nil
}
+// UpdateGpu updates a single GPU resource (spec only).
func (s *gpuService) UpdateGpu(ctx context.Context, req *pb.UpdateGpuRequest) (*pb.Gpu, error) {
logger := klog.FromContext(ctx)
if req.GetGpu() == nil {
return nil, status.Error(codes.InvalidArgument, "resource body is required")
}
- if req.GetGpu().GetMetadata() == nil || req.GetGpu().GetMetadata().GetName() == "" {
+ if req.GetGpu().GetMetadata() == nil {
return nil, status.Error(codes.InvalidArgument, "metadata.name: Required value")
}
+ if err := validateGPUName(req.GetGpu().GetMetadata().GetName()); err != nil {
+ return nil, err
+ }
name := req.GetGpu().GetMetadata().GetName()
ns := req.GetGpu().GetMetadata().GetNamespace()
@@ -289,8 +347,8 @@ func (s *gpuService) UpdateGpu(ctx context.Context, req *pb.UpdateGpuRequest) (*
ctx,
key,
updatedGpu,
- false, // ignoreNotFound
- nil, // TODO: preconditions
+ false,
+ nil,
func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) {
curr := input.(*devicev1alpha1.GPU)
incoming := devicev1alpha1.FromProto(req.GetGpu())
@@ -319,7 +377,7 @@ func (s *gpuService) UpdateGpu(ctx context.Context, req *pb.UpdateGpuRequest) (*
return clone, nil, nil
},
- nil, // TODO: cachedExistingObject
+ nil,
)
if err != nil {
@@ -345,11 +403,76 @@ func (s *gpuService) UpdateGpu(ctx context.Context, req *pb.UpdateGpuRequest) (*
return devicev1alpha1.ToProto(updatedGpu), nil
}
+// UpdateGpuStatus updates only the status subresource of a GPU.
+func (s *gpuService) UpdateGpuStatus(ctx context.Context, req *pb.UpdateGpuStatusRequest) (*pb.Gpu, error) {
+ logger := klog.FromContext(ctx)
+
+ if req.GetGpu() == nil {
+ return nil, status.Error(codes.InvalidArgument, "resource body is required")
+ }
+ if req.GetGpu().GetMetadata() == nil {
+ return nil, status.Error(codes.InvalidArgument, "metadata.name: Required value")
+ }
+ if err := validateGPUName(req.GetGpu().GetMetadata().GetName()); err != nil {
+ return nil, err
+ }
+ if req.GetGpu().GetStatus() == nil {
+ return nil, status.Error(codes.InvalidArgument, "status is required")
+ }
+
+ name := req.GetGpu().GetMetadata().GetName()
+ ns := req.GetGpu().GetMetadata().GetNamespace()
+ key := s.storageKey(ns, name)
+ updatedGpu := &devicev1alpha1.GPU{}
+
+ err := s.storage.GuaranteedUpdate(
+ ctx,
+ key,
+ updatedGpu,
+ false,
+ nil,
+ func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) {
+ curr := input.(*devicev1alpha1.GPU)
+ incoming := devicev1alpha1.FromProto(req.GetGpu())
+
+ if incoming.ResourceVersion != "" && incoming.ResourceVersion != curr.ResourceVersion {
+ return nil, nil, storage.NewResourceVersionConflictsError(key, 0)
+ }
+
+ clone := curr.DeepCopy()
+ clone.Status = incoming.Status
+
+ return clone, nil, nil
+ },
+ nil,
+ )
+
+ if err != nil {
+ if storage.IsNotFound(err) {
+ return nil, status.Errorf(codes.NotFound, "GPU %q not found", name)
+ }
+ if storage.IsConflict(err) {
+ return nil, status.Errorf(codes.Aborted,
+ "operation cannot be fulfilled on GPUs %q: the object has been modified", name)
+ }
+ logger.Error(err, "failed to update GPU status", "name", name, "namespace", ns)
+ return nil, status.Error(codes.Internal, "internal server error")
+ }
+
+ logger.V(2).Info("Successfully updated GPU status", "name", name, "namespace", ns, "resourceVersion", updatedGpu.ResourceVersion)
+
+ return devicev1alpha1.ToProto(updatedGpu), nil
+}
+
+// DeleteGpu deletes a single GPU resource.
func (s *gpuService) DeleteGpu(ctx context.Context, req *pb.DeleteGpuRequest) (*emptypb.Empty, error) {
logger := klog.FromContext(ctx)
- if req.GetName() == "" {
- return nil, status.Error(codes.InvalidArgument, "name is required")
+ if err := validateGPUName(req.GetName()); err != nil {
+ return nil, err
+ }
+ if err := validateNamespace(req.GetNamespace()); err != nil {
+ return nil, err
}
name := req.GetName()
@@ -361,10 +484,10 @@ func (s *gpuService) DeleteGpu(ctx context.Context, req *pb.DeleteGpuRequest) (*
ctx,
key,
out,
- nil, // TODO: preconditions (e.g., rv check)
+ nil,
storage.ValidateAllObjectFunc,
- nil, // TODO: cachedExistingObject
- storage.DeleteOptions{}, // TODO: DeleteOptions
+ nil,
+ storage.DeleteOptions{},
); err != nil {
if storage.IsNotFound(err) {
return nil, status.Errorf(codes.NotFound, "GPU %q not found", name)
diff --git a/pkg/services/device/v1alpha1/gpu_service_test.go b/pkg/services/device/v1alpha1/gpu_service_test.go
new file mode 100644
index 000000000..184869c97
--- /dev/null
+++ b/pkg/services/device/v1alpha1/gpu_service_test.go
@@ -0,0 +1,494 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package v1alpha1
+
+import (
+ "context"
+ "strings"
+ "testing"
+
+ devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1"
+ pb "github.com/nvidia/nvsentinel/internal/generated/device/v1alpha1"
+ "github.com/nvidia/nvsentinel/pkg/storage/memory"
+ "google.golang.org/grpc/codes"
+ "google.golang.org/grpc/status"
+ "k8s.io/apimachinery/pkg/runtime"
+ "k8s.io/apimachinery/pkg/runtime/schema"
+ "k8s.io/apimachinery/pkg/runtime/serializer"
+)
+
+func newTestService(t *testing.T) *gpuService {
+ t.Helper()
+
+ scheme := runtime.NewScheme()
+ if err := devicev1alpha1.AddToScheme(scheme); err != nil {
+ t.Fatal(err)
+ }
+
+ codecs := serializer.NewCodecFactory(scheme)
+ gv := devicev1alpha1.SchemeGroupVersion
+ info, _ := runtime.SerializerInfoForMediaType(codecs.SupportedMediaTypes(), runtime.ContentTypeJSON)
+ codec := codecs.CodecForVersions(info.Serializer, info.Serializer, schema.GroupVersions{gv}, schema.GroupVersions{gv})
+
+ s, destroy, err := memory.CreateStorage(codec)
+ if err != nil {
+ t.Fatal(err)
+ }
+ t.Cleanup(destroy)
+
+ return NewGPUService(s, destroy)
+}
+
+func createTestGpu(t *testing.T, svc *gpuService, name string) *pb.Gpu {
+ t.Helper()
+
+ gpu, err := svc.CreateGpu(context.Background(), &pb.CreateGpuRequest{
+ Gpu: &pb.Gpu{
+ Metadata: &pb.ObjectMeta{
+ Name: name,
+ Namespace: "default",
+ },
+ Spec: &pb.GpuSpec{
+ Uuid: name,
+ },
+ },
+ })
+ if err != nil {
+ t.Fatalf("failed to create GPU %q: %v", name, err)
+ }
+
+ return gpu
+}
+
+func TestGPUService_CreateAndGet(t *testing.T) {
+ svc := newTestService(t)
+ ctx := context.Background()
+
+ const gpuName = "GPU-00000000-0000-0000-0000-000000000000"
+ created := createTestGpu(t, svc, gpuName)
+
+ if created.GetMetadata().GetName() != gpuName {
+ t.Errorf("expected name %q, got %q", gpuName, created.GetMetadata().GetName())
+ }
+ if created.GetMetadata().GetUid() == "" {
+ t.Error("expected UID to be set on created GPU")
+ }
+
+ resp, err := svc.GetGpu(ctx, &pb.GetGpuRequest{
+ Name: gpuName,
+ Namespace: "default",
+ })
+ if err != nil {
+ t.Fatalf("GetGpu failed: %v", err)
+ }
+
+ got := resp.GetGpu()
+ if got.GetMetadata().GetName() != gpuName {
+ t.Errorf("expected name %q, got %q", gpuName, got.GetMetadata().GetName())
+ }
+ if got.GetMetadata().GetUid() != created.GetMetadata().GetUid() {
+ t.Errorf("UID mismatch: expected %q, got %q",
+ created.GetMetadata().GetUid(), got.GetMetadata().GetUid())
+ }
+}
+
+func TestGPUService_CreateDuplicate(t *testing.T) {
+ svc := newTestService(t)
+
+ const gpuName = "GPU-11111111-1111-1111-1111-111111111111"
+ createTestGpu(t, svc, gpuName)
+
+ _, err := svc.CreateGpu(context.Background(), &pb.CreateGpuRequest{
+ Gpu: &pb.Gpu{
+ Metadata: &pb.ObjectMeta{
+ Name: gpuName,
+ Namespace: "default",
+ },
+ Spec: &pb.GpuSpec{
+ Uuid: gpuName,
+ },
+ },
+ })
+ if err == nil {
+ t.Fatal("expected error for duplicate create, got nil")
+ }
+
+ st, ok := status.FromError(err)
+ if !ok {
+ t.Fatalf("expected gRPC status error, got %T: %v", err, err)
+ }
+ if st.Code() != codes.AlreadyExists {
+ t.Errorf("expected code %v, got %v: %s", codes.AlreadyExists, st.Code(), st.Message())
+ }
+}
+
+func TestGPUService_List(t *testing.T) {
+ svc := newTestService(t)
+ ctx := context.Background()
+
+ createTestGpu(t, svc, "GPU-aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa")
+ createTestGpu(t, svc, "GPU-bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb")
+
+ resp, err := svc.ListGpus(ctx, &pb.ListGpusRequest{
+ Namespace: "default",
+ })
+ if err != nil {
+ t.Fatalf("ListGpus failed: %v", err)
+ }
+
+ count := len(resp.GetGpuList().GetItems())
+ if count != 2 {
+ t.Errorf("expected 2 GPUs, got %d", count)
+ }
+}
+
+func TestGPUService_Delete(t *testing.T) {
+ svc := newTestService(t)
+ ctx := context.Background()
+
+ const gpuName = "GPU-22222222-2222-2222-2222-222222222222"
+ createTestGpu(t, svc, gpuName)
+
+ _, err := svc.DeleteGpu(ctx, &pb.DeleteGpuRequest{
+ Name: gpuName,
+ Namespace: "default",
+ })
+ if err != nil {
+ t.Fatalf("DeleteGpu failed: %v", err)
+ }
+
+ _, err = svc.GetGpu(ctx, &pb.GetGpuRequest{
+ Name: gpuName,
+ Namespace: "default",
+ })
+ if err == nil {
+ t.Fatal("expected NotFound after delete, got nil")
+ }
+
+ st, ok := status.FromError(err)
+ if !ok {
+ t.Fatalf("expected gRPC status error, got %T: %v", err, err)
+ }
+ if st.Code() != codes.NotFound {
+ t.Errorf("expected code %v, got %v: %s", codes.NotFound, st.Code(), st.Message())
+ }
+}
+
+func TestGPUService_DeleteNotFound(t *testing.T) {
+ svc := newTestService(t)
+
+ _, err := svc.DeleteGpu(context.Background(), &pb.DeleteGpuRequest{
+ Name: "GPU-ffffffff-ffff-ffff-ffff-ffffffffffff",
+ Namespace: "default",
+ })
+ if err == nil {
+ t.Fatal("expected NotFound error, got nil")
+ }
+
+ st, ok := status.FromError(err)
+ if !ok {
+ t.Fatalf("expected gRPC status error, got %T: %v", err, err)
+ }
+ if st.Code() != codes.NotFound {
+ t.Errorf("expected code %v, got %v: %s", codes.NotFound, st.Code(), st.Message())
+ }
+}
+
+func TestGPUService_Update(t *testing.T) {
+ svc := newTestService(t)
+ ctx := context.Background()
+
+ const gpuName = "GPU-33333333-3333-3333-3333-333333333333"
+ created := createTestGpu(t, svc, gpuName)
+
+ updated, err := svc.UpdateGpu(ctx, &pb.UpdateGpuRequest{
+ Gpu: &pb.Gpu{
+ Metadata: &pb.ObjectMeta{
+ Name: gpuName,
+ Namespace: "default",
+ },
+ Spec: &pb.GpuSpec{
+ Uuid: "GPU-new-uuid",
+ },
+ },
+ })
+ if err != nil {
+ t.Fatalf("UpdateGpu failed: %v", err)
+ }
+
+ if updated.GetSpec().GetUuid() != "GPU-new-uuid" {
+ t.Errorf("expected spec.uuid %q, got %q", "GPU-new-uuid", updated.GetSpec().GetUuid())
+ }
+ if updated.GetMetadata().GetGeneration() != created.GetMetadata().GetGeneration()+1 {
+ t.Errorf("expected generation %d, got %d",
+ created.GetMetadata().GetGeneration()+1, updated.GetMetadata().GetGeneration())
+ }
+}
+
+func TestGPUService_UpdateStatus(t *testing.T) {
+ svc := newTestService(t)
+ ctx := context.Background()
+
+ const gpuName = "GPU-44444444-4444-4444-4444-444444444444"
+ created := createTestGpu(t, svc, gpuName)
+
+ updated, err := svc.UpdateGpuStatus(ctx, &pb.UpdateGpuStatusRequest{
+ Gpu: &pb.Gpu{
+ Metadata: &pb.ObjectMeta{
+ Name: gpuName,
+ Namespace: "default",
+ },
+ Status: &pb.GpuStatus{
+ RecommendedAction: "drain",
+ },
+ },
+ })
+ if err != nil {
+ t.Fatalf("UpdateGpuStatus failed: %v", err)
+ }
+
+ if updated.GetStatus().GetRecommendedAction() != "drain" {
+ t.Errorf("expected recommended action %q, got %q",
+ "drain", updated.GetStatus().GetRecommendedAction())
+ }
+
+ // Generation must NOT change on status-only updates.
+ if updated.GetMetadata().GetGeneration() != created.GetMetadata().GetGeneration() {
+ t.Errorf("expected generation %d (unchanged), got %d",
+ created.GetMetadata().GetGeneration(), updated.GetMetadata().GetGeneration())
+ }
+}
+
+func TestGPUService_UpdateStatus_StaleResourceVersion(t *testing.T) {
+ svc := newTestService(t)
+ ctx := context.Background()
+
+ const gpuName = "GPU-55555555-5555-5555-5555-555555555555"
+ created := createTestGpu(t, svc, gpuName)
+ staleRV := created.GetMetadata().GetResourceVersion()
+
+ // Update spec to increment the resource version.
+ _, err := svc.UpdateGpu(ctx, &pb.UpdateGpuRequest{
+ Gpu: &pb.Gpu{
+ Metadata: &pb.ObjectMeta{
+ Name: gpuName,
+ Namespace: "default",
+ },
+ Spec: &pb.GpuSpec{
+ Uuid: "GPU-updated-uuid",
+ },
+ },
+ })
+ if err != nil {
+ t.Fatalf("UpdateGpu failed: %v", err)
+ }
+
+ // Attempt status update with the stale resource version.
+ _, err = svc.UpdateGpuStatus(ctx, &pb.UpdateGpuStatusRequest{
+ Gpu: &pb.Gpu{
+ Metadata: &pb.ObjectMeta{
+ Name: gpuName,
+ Namespace: "default",
+ ResourceVersion: staleRV,
+ },
+ Status: &pb.GpuStatus{
+ RecommendedAction: "drain",
+ },
+ },
+ })
+ if err == nil {
+ t.Fatal("expected error for stale resource version, got nil")
+ }
+
+ st, ok := status.FromError(err)
+ if !ok {
+ t.Fatalf("expected gRPC status error, got %T: %v", err, err)
+ }
+ if st.Code() != codes.Aborted {
+ t.Errorf("expected code %v, got %v: %s", codes.Aborted, st.Code(), st.Message())
+ }
+}
+
+func TestGPUService_UpdateStatus_NilStatus(t *testing.T) {
+ svc := newTestService(t)
+ ctx := context.Background()
+
+ const gpuName = "GPU-66666666-6666-6666-6666-666666666666"
+ createTestGpu(t, svc, gpuName)
+
+ _, err := svc.UpdateGpuStatus(ctx, &pb.UpdateGpuStatusRequest{
+ Gpu: &pb.Gpu{
+ Metadata: &pb.ObjectMeta{
+ Name: gpuName,
+ Namespace: "default",
+ },
+ Status: nil,
+ },
+ })
+ if err == nil {
+ t.Fatal("expected error for nil status, got nil")
+ }
+
+ st, ok := status.FromError(err)
+ if !ok {
+ t.Fatalf("expected gRPC status error, got %T: %v", err, err)
+ }
+ if st.Code() != codes.InvalidArgument {
+ t.Errorf("expected code %v, got %v: %s", codes.InvalidArgument, st.Code(), st.Message())
+ }
+}
+
+func TestGPUService_UpdateStatus_EmptyConditions(t *testing.T) {
+ svc := newTestService(t)
+ ctx := context.Background()
+
+ const gpuName = "GPU-77777777-7777-7777-7777-777777777777"
+ createTestGpu(t, svc, gpuName)
+
+ // First set a condition.
+ _, err := svc.UpdateGpuStatus(ctx, &pb.UpdateGpuStatusRequest{
+ Gpu: &pb.Gpu{
+ Metadata: &pb.ObjectMeta{
+ Name: gpuName,
+ Namespace: "default",
+ },
+ Status: &pb.GpuStatus{
+ Conditions: []*pb.Condition{
+ {
+ Type: "Ready",
+ Status: "True",
+ Reason: "TestReason",
+ },
+ },
+ RecommendedAction: "drain",
+ },
+ },
+ })
+ if err != nil {
+ t.Fatalf("UpdateGpuStatus (set condition) failed: %v", err)
+ }
+
+ // Now update with empty conditions to clear them.
+ updated, err := svc.UpdateGpuStatus(ctx, &pb.UpdateGpuStatusRequest{
+ Gpu: &pb.Gpu{
+ Metadata: &pb.ObjectMeta{
+ Name: gpuName,
+ Namespace: "default",
+ },
+ Status: &pb.GpuStatus{
+ Conditions: []*pb.Condition{},
+ RecommendedAction: "none",
+ },
+ },
+ })
+ if err != nil {
+ t.Fatalf("UpdateGpuStatus (clear conditions) failed: %v", err)
+ }
+
+ if len(updated.GetStatus().GetConditions()) != 0 {
+ t.Errorf("expected 0 conditions after clearing, got %d", len(updated.GetStatus().GetConditions()))
+ }
+ if updated.GetStatus().GetRecommendedAction() != "none" {
+ t.Errorf("expected recommended action %q, got %q", "none", updated.GetStatus().GetRecommendedAction())
+ }
+}
+
+func TestGPUService_CreateValidation(t *testing.T) {
+ svc := newTestService(t)
+
+ tests := []struct {
+ name string
+ req *pb.CreateGpuRequest
+ }{
+ {
+ name: "nil gpu body",
+ req: &pb.CreateGpuRequest{},
+ },
+ {
+ name: "nil metadata",
+ req: &pb.CreateGpuRequest{
+ Gpu: &pb.Gpu{
+ Spec: &pb.GpuSpec{Uuid: "GPU-test"},
+ },
+ },
+ },
+ {
+ name: "empty name",
+ req: &pb.CreateGpuRequest{
+ Gpu: &pb.Gpu{
+ Metadata: &pb.ObjectMeta{Name: ""},
+ Spec: &pb.GpuSpec{Uuid: "GPU-test"},
+ },
+ },
+ },
+ {
+ name: "invalid GPU UUID format",
+ req: &pb.CreateGpuRequest{
+ Gpu: &pb.Gpu{
+ Metadata: &pb.ObjectMeta{Name: "not-a-gpu-uuid"},
+ Spec: &pb.GpuSpec{Uuid: "GPU-test"},
+ },
+ },
+ },
+ {
+ name: "path traversal in name",
+ req: &pb.CreateGpuRequest{
+ Gpu: &pb.Gpu{
+ Metadata: &pb.ObjectMeta{Name: "../../etc/passwd"},
+ Spec: &pb.GpuSpec{Uuid: "GPU-test"},
+ },
+ },
+ },
+ }
+
+ for _, tc := range tests {
+ t.Run(tc.name, func(t *testing.T) {
+ _, err := svc.CreateGpu(context.Background(), tc.req)
+ if err == nil {
+ t.Fatal("expected InvalidArgument error, got nil")
+ }
+
+ st, ok := status.FromError(err)
+ if !ok {
+ t.Fatalf("expected gRPC status error, got %T: %v", err, err)
+ }
+ if st.Code() != codes.InvalidArgument {
+ t.Errorf("expected code %v, got %v: %s", codes.InvalidArgument, st.Code(), st.Message())
+ }
+ })
+ }
+}
+
+func TestGPUService_NamespaceValidation(t *testing.T) {
+ svc := newTestService(t)
+ ctx := context.Background()
+
+ longNS := strings.Repeat("a", 254)
+
+ _, err := svc.GetGpu(ctx, &pb.GetGpuRequest{
+ Name: "GPU-00000000-0000-0000-0000-000000000000",
+ Namespace: longNS,
+ })
+ if err == nil {
+ t.Fatal("expected InvalidArgument for long namespace, got nil")
+ }
+ st, ok := status.FromError(err)
+ if !ok {
+ t.Fatalf("expected gRPC status error, got %T: %v", err, err)
+ }
+ if st.Code() != codes.InvalidArgument {
+ t.Errorf("expected code %v, got %v: %s", codes.InvalidArgument, st.Code(), st.Message())
+ }
+}
diff --git a/pkg/services/device/v1alpha1/integration_test.go b/pkg/services/device/v1alpha1/integration_test.go
new file mode 100644
index 000000000..f84344575
--- /dev/null
+++ b/pkg/services/device/v1alpha1/integration_test.go
@@ -0,0 +1,408 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package v1alpha1_test
+
+import (
+	"context"
+	"io"
+	"testing"
+	"time"
+
+	pb "github.com/nvidia/nvsentinel/internal/generated/device/v1alpha1"
+	"github.com/nvidia/nvsentinel/pkg/testutil"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+	"google.golang.org/protobuf/types/known/timestamppb"
+)
+
+// TestIntegration_CRUD performs a full Create→Get→List→Update→Delete cycle over gRPC.
+//
+// The test drives a real client from testutil.NewTestGPUClient, so every step
+// exercises the full wire path (marshalling, server handler, storage) rather
+// than calling the service implementation directly.
+func TestIntegration_CRUD(t *testing.T) {
+	client := testutil.NewTestGPUClient(t)
+	ctx := t.Context()
+
+	const gpuName = "GPU-12345678-1234-1234-1234-123456789abc"
+
+	// Create a GPU
+	created, err := client.CreateGpu(ctx, &pb.CreateGpuRequest{
+		Gpu: &pb.Gpu{
+			Metadata: &pb.ObjectMeta{
+				Name:      gpuName,
+				Namespace: "default",
+			},
+			Spec: &pb.GpuSpec{
+				Uuid: "GPU-1234",
+			},
+		},
+	})
+	if err != nil {
+		t.Fatalf("CreateGpu failed: %v", err)
+	}
+
+	// The server must echo back the request fields and assign a UID of its own.
+	if created.GetMetadata().GetName() != gpuName {
+		t.Errorf("expected name %q, got %q", gpuName, created.GetMetadata().GetName())
+	}
+	if created.GetSpec().GetUuid() != "GPU-1234" {
+		t.Errorf("expected UUID %q, got %q", "GPU-1234", created.GetSpec().GetUuid())
+	}
+	if created.GetMetadata().GetUid() == "" {
+		t.Error("expected UID to be set")
+	}
+
+	// Get it back
+	getResp, err := client.GetGpu(ctx, &pb.GetGpuRequest{
+		Name:      gpuName,
+		Namespace: "default",
+	})
+	if err != nil {
+		t.Fatalf("GetGpu failed: %v", err)
+	}
+
+	got := getResp.GetGpu()
+	if got.GetSpec().GetUuid() != "GPU-1234" {
+		t.Errorf("expected UUID %q, got %q", "GPU-1234", got.GetSpec().GetUuid())
+	}
+
+	// List namespace "default"
+	listResp, err := client.ListGpus(ctx, &pb.ListGpusRequest{
+		Namespace: "default",
+	})
+	if err != nil {
+		t.Fatalf("ListGpus failed: %v", err)
+	}
+
+	if len(listResp.GetGpuList().GetItems()) != 1 {
+		t.Errorf("expected 1 GPU, got %d", len(listResp.GetGpuList().GetItems()))
+	}
+
+	// Update the spec (change UUID to "GPU-5678").
+	// `got` carries the server-assigned ResourceVersion, so the update is
+	// submitted against the current revision of the object.
+	got.Spec.Uuid = "GPU-5678"
+	updated, err := client.UpdateGpu(ctx, &pb.UpdateGpuRequest{
+		Gpu: got,
+	})
+	if err != nil {
+		t.Fatalf("UpdateGpu failed: %v", err)
+	}
+
+	if updated.GetSpec().GetUuid() != "GPU-5678" {
+		t.Errorf("expected UUID %q, got %q", "GPU-5678", updated.GetSpec().GetUuid())
+	}
+
+	// Verify change persists by re-reading, not just trusting the response.
+	getResp2, err := client.GetGpu(ctx, &pb.GetGpuRequest{
+		Name:      gpuName,
+		Namespace: "default",
+	})
+	if err != nil {
+		t.Fatalf("GetGpu (after update) failed: %v", err)
+	}
+
+	if getResp2.GetGpu().GetSpec().GetUuid() != "GPU-5678" {
+		t.Errorf("expected UUID %q after update, got %q", "GPU-5678", getResp2.GetGpu().GetSpec().GetUuid())
+	}
+
+	// Delete it
+	_, err = client.DeleteGpu(ctx, &pb.DeleteGpuRequest{
+		Name:      gpuName,
+		Namespace: "default",
+	})
+	if err != nil {
+		t.Fatalf("DeleteGpu failed: %v", err)
+	}
+
+	// List again, verify count=0
+	listResp2, err := client.ListGpus(ctx, &pb.ListGpusRequest{
+		Namespace: "default",
+	})
+	if err != nil {
+		t.Fatalf("ListGpus (after delete) failed: %v", err)
+	}
+
+	if len(listResp2.GetGpuList().GetItems()) != 0 {
+		t.Errorf("expected 0 GPUs after delete, got %d", len(listResp2.GetGpuList().GetItems()))
+	}
+}
+
+// TestIntegration_Watch tests the streaming WatchGpus RPC.
+//
+// The previous version polled stream.Recv inside a select with a default
+// branch, so the 5s timeout channel was only consulted *between* Recv calls;
+// a stream that never delivered an event would block the test forever.
+// Bounding the stream with a context deadline makes Recv itself fail once
+// the deadline passes, so the test can never hang.
+func TestIntegration_Watch(t *testing.T) {
+	client := testutil.NewTestGPUClient(t)
+	ctx := t.Context()
+
+	const gpuName = "GPU-aabbccdd-1122-3344-5566-778899aabbcc"
+
+	// Bound the whole watch with a deadline; cancelling the context aborts
+	// any in-flight stream.Recv.
+	watchCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
+	defer cancel()
+
+	// Start a watch stream
+	stream, err := client.WatchGpus(watchCtx, &pb.WatchGpusRequest{
+		Namespace: "default",
+	})
+	if err != nil {
+		t.Fatalf("WatchGpus failed to start: %v", err)
+	}
+
+	// Create a GPU in a separate goroutine after a brief delay. The delay
+	// gives the server-side watch time to be fully established; the context
+	// deadline above is the real synchronization backstop.
+	doneCh := make(chan struct{})
+	go func() {
+		defer close(doneCh)
+		time.Sleep(100 * time.Millisecond)
+		_, err := client.CreateGpu(ctx, &pb.CreateGpuRequest{
+			Gpu: &pb.Gpu{
+				Metadata: &pb.ObjectMeta{
+					Name:      gpuName,
+					Namespace: "default",
+				},
+				Spec: &pb.GpuSpec{
+					Uuid: "GPU-WATCH-1",
+				},
+			},
+		})
+		if err != nil {
+			t.Errorf("CreateGpu in watch test failed: %v", err)
+		}
+	}()
+
+	// Drain events until the ADDED event for our GPU arrives. If the
+	// deadline elapses first, Recv returns a DeadlineExceeded error and
+	// the test fails promptly instead of hanging.
+	for {
+		event, err := stream.Recv()
+		if err == io.EOF {
+			t.Fatal("stream closed before receiving event")
+		}
+		if err != nil {
+			t.Fatalf("stream.Recv() failed (possibly timed out): %v", err)
+		}
+
+		if event.GetType() == "ADDED" && event.GetObject().GetMetadata().GetName() == gpuName {
+			if event.GetObject().GetSpec().GetUuid() != "GPU-WATCH-1" {
+				t.Errorf("expected UUID %q, got %q", "GPU-WATCH-1", event.GetObject().GetSpec().GetUuid())
+			}
+			break
+		}
+	}
+
+	// Wait for the create goroutine to finish
+	<-doneCh
+
+	// Clean up using the test context, not the (possibly expired) watch context.
+	_, err = client.DeleteGpu(ctx, &pb.DeleteGpuRequest{
+		Name:      gpuName,
+		Namespace: "default",
+	})
+	if err != nil {
+		t.Errorf("cleanup DeleteGpu failed: %v", err)
+	}
+}
+
+// TestIntegration_WatchWithResourceVersion_OutOfRange verifies that requesting
+// a watch from a specific ResourceVersion returns codes.OutOfRange, because the
+// in-memory store does not support watch resume.
+func TestIntegration_WatchWithResourceVersion_OutOfRange(t *testing.T) {
+	client := testutil.NewTestGPUClient(t)
+
+	req := &pb.WatchGpusRequest{
+		Namespace: "default",
+		Opts: &pb.ListOptions{
+			ResourceVersion: "1",
+		},
+	}
+
+	stream, err := client.WatchGpus(t.Context(), req)
+	if err != nil {
+		t.Fatalf("WatchGpus failed to open stream: %v", err)
+	}
+
+	// Server-streaming handler errors are delivered on the first Recv, not
+	// when the stream is opened.
+	_, recvErr := stream.Recv()
+	if recvErr == nil {
+		t.Fatal("expected OutOfRange error for non-empty ResourceVersion, got nil")
+	}
+
+	if code := status.Code(recvErr); code != codes.OutOfRange {
+		t.Errorf("expected codes.OutOfRange, got %v: %v", code, recvErr)
+	}
+}
+
+// TestIntegration_UpdateStatus tests the status subresource update.
+//
+// The flow is: create a GPU, push a status update carrying a single Ready
+// condition plus a RecommendedAction, then re-read the object to confirm the
+// status change was persisted (not just echoed in the RPC response).
+func TestIntegration_UpdateStatus(t *testing.T) {
+	client := testutil.NewTestGPUClient(t)
+	ctx := t.Context()
+
+	const gpuName = "GPU-55667788-aabb-ccdd-eeff-001122334455"
+
+	// Create a GPU
+	created, err := client.CreateGpu(ctx, &pb.CreateGpuRequest{
+		Gpu: &pb.Gpu{
+			Metadata: &pb.ObjectMeta{
+				Name:      gpuName,
+				Namespace: "default",
+			},
+			Spec: &pb.GpuSpec{
+				Uuid: "GPU-STATUS-1",
+			},
+		},
+	})
+	if err != nil {
+		t.Fatalf("CreateGpu failed: %v", err)
+	}
+
+	// Update the status with a condition. The ResourceVersion from the
+	// create response is passed so the update targets the current revision.
+	updatedGpu, err := client.UpdateGpuStatus(ctx, &pb.UpdateGpuStatusRequest{
+		Gpu: &pb.Gpu{
+			Metadata: &pb.ObjectMeta{
+				Name:            gpuName,
+				Namespace:       "default",
+				ResourceVersion: created.GetMetadata().GetResourceVersion(),
+			},
+			Status: &pb.GpuStatus{
+				Conditions: []*pb.Condition{
+					{
+						Type:               "Ready",
+						Status:             "True",
+						LastTransitionTime: timestamppb.Now(),
+						Reason:             "TestReason",
+						Message:            "Test message",
+					},
+				},
+				RecommendedAction: "No action needed",
+			},
+		},
+	})
+	if err != nil {
+		t.Fatalf("UpdateGpuStatus failed: %v", err)
+	}
+
+	if len(updatedGpu.GetStatus().GetConditions()) != 1 {
+		t.Errorf("expected 1 condition, got %d", len(updatedGpu.GetStatus().GetConditions()))
+	}
+
+	// Get the GPU and verify status was updated
+	getResp, err := client.GetGpu(ctx, &pb.GetGpuRequest{
+		Name:      gpuName,
+		Namespace: "default",
+	})
+	if err != nil {
+		t.Fatalf("GetGpu failed: %v", err)
+	}
+
+	gpu := getResp.GetGpu()
+	if len(gpu.GetStatus().GetConditions()) != 1 {
+		t.Errorf("expected 1 condition in retrieved GPU, got %d", len(gpu.GetStatus().GetConditions()))
+	}
+
+	// Field-by-field verification of the persisted condition.
+	cond := gpu.GetStatus().GetConditions()[0]
+	if cond.GetType() != "Ready" {
+		t.Errorf("expected condition type %q, got %q", "Ready", cond.GetType())
+	}
+	if cond.GetStatus() != "True" {
+		t.Errorf("expected condition status %q, got %q", "True", cond.GetStatus())
+	}
+	if cond.GetReason() != "TestReason" {
+		t.Errorf("expected condition reason %q, got %q", "TestReason", cond.GetReason())
+	}
+	if gpu.GetStatus().GetRecommendedAction() != "No action needed" {
+		t.Errorf("expected recommended action %q, got %q", "No action needed", gpu.GetStatus().GetRecommendedAction())
+	}
+
+	// Clean up
+	_, err = client.DeleteGpu(ctx, &pb.DeleteGpuRequest{
+		Name:      gpuName,
+		Namespace: "default",
+	})
+	if err != nil {
+		t.Errorf("cleanup DeleteGpu failed: %v", err)
+	}
+}
+
+// TestIntegration_ErrorCodes verifies correct gRPC error codes are returned.
+//
+// Covered mappings: Get of a missing object → NotFound, duplicate Create →
+// AlreadyExists, and Delete of a missing object → NotFound.
+func TestIntegration_ErrorCodes(t *testing.T) {
+	client := testutil.NewTestGPUClient(t)
+	ctx := t.Context()
+
+	const gpuName = "GPU-deadbeef-dead-beef-dead-beefdeadbeef"
+
+	// Get non-existent GPU → codes.NotFound
+	_, err := client.GetGpu(ctx, &pb.GetGpuRequest{
+		Name:      gpuName,
+		Namespace: "default",
+	})
+	if err == nil {
+		t.Fatal("expected error for non-existent GPU")
+	}
+	if status.Code(err) != codes.NotFound {
+		t.Errorf("expected codes.NotFound, got %v", status.Code(err))
+	}
+
+	// Create a GPU
+	_, err = client.CreateGpu(ctx, &pb.CreateGpuRequest{
+		Gpu: &pb.Gpu{
+			Metadata: &pb.ObjectMeta{
+				Name:      gpuName,
+				Namespace: "default",
+			},
+			Spec: &pb.GpuSpec{
+				Uuid: "GPU-ERROR-1",
+			},
+		},
+	})
+	if err != nil {
+		t.Fatalf("CreateGpu failed: %v", err)
+	}
+
+	// Create duplicate → codes.AlreadyExists. A different spec UUID is used
+	// to show the conflict is keyed on metadata name, not spec contents.
+	_, err = client.CreateGpu(ctx, &pb.CreateGpuRequest{
+		Gpu: &pb.Gpu{
+			Metadata: &pb.ObjectMeta{
+				Name:      gpuName,
+				Namespace: "default",
+			},
+			Spec: &pb.GpuSpec{
+				Uuid: "GPU-ERROR-2",
+			},
+		},
+	})
+	if err == nil {
+		t.Fatal("expected error for duplicate GPU creation")
+	}
+	if status.Code(err) != codes.AlreadyExists {
+		t.Errorf("expected codes.AlreadyExists, got %v", status.Code(err))
+	}
+
+	// Delete the GPU
+	_, err = client.DeleteGpu(ctx, &pb.DeleteGpuRequest{
+		Name:      gpuName,
+		Namespace: "default",
+	})
+	if err != nil {
+		t.Fatalf("DeleteGpu failed: %v", err)
+	}
+
+	// Delete non-existent → codes.NotFound
+	_, err = client.DeleteGpu(ctx, &pb.DeleteGpuRequest{
+		Name:      gpuName,
+		Namespace: "default",
+	})
+	if err == nil {
+		t.Fatal("expected error for deleting non-existent GPU")
+	}
+	if status.Code(err) != codes.NotFound {
+		t.Errorf("expected codes.NotFound for delete, got %v", status.Code(err))
+	}
+}
diff --git a/pkg/storage/memory/factory.go b/pkg/storage/memory/factory.go
new file mode 100644
index 000000000..057dd2edb
--- /dev/null
+++ b/pkg/storage/memory/factory.go
@@ -0,0 +1,32 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memory
+
+import (
+ "k8s.io/apimachinery/pkg/runtime"
+ "k8s.io/apiserver/pkg/storage"
+ "k8s.io/apiserver/pkg/storage/storagebackend/factory"
+)
+
+// CreateStorage returns a new in-memory storage.Interface, a DestroyFunc, and any error.
+// This mirrors the signature of storagebackend/factory.Create() so it can be
+// used as a drop-in replacement in ServiceProvider.Install().
+func CreateStorage(codec runtime.Codec) (storage.Interface, factory.DestroyFunc, error) {
+	// In-memory storage holds no external resources, so teardown is a no-op.
+	noop := factory.DestroyFunc(func() {})
+
+	return NewStore(codec), noop, nil
+}
diff --git a/pkg/storage/memory/factory_test.go b/pkg/storage/memory/factory_test.go
new file mode 100644
index 000000000..49a749e62
--- /dev/null
+++ b/pkg/storage/memory/factory_test.go
@@ -0,0 +1,62 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memory
+
+import (
+ "context"
+ "testing"
+
+ "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+ "k8s.io/apiserver/pkg/storage"
+)
+
+func TestCreateStorage(t *testing.T) {
+ s, destroy, err := CreateStorage(codec)
+ if err != nil {
+ t.Fatalf("CreateStorage failed: %v", err)
+ }
+ defer destroy()
+
+ if s == nil {
+ t.Fatal("expected non-nil storage.Interface")
+ }
+
+ // Verify it's functional by doing a basic Create + Get.
+ ctx := context.Background()
+ obj := newTestObject("factory-gpu", "default")
+ if err := s.Create(ctx, "/test/factory-gpu", obj, nil, 0); err != nil {
+ t.Fatalf("Create via factory storage failed: %v", err)
+ }
+
+ got := &unstructured.Unstructured{}
+ if err := s.Get(ctx, "/test/factory-gpu", storage.GetOptions{}, got); err != nil {
+ t.Fatalf("Get via factory storage failed: %v", err)
+ }
+
+ if got.GetName() != "factory-gpu" {
+ t.Errorf("expected name factory-gpu, got %s", got.GetName())
+ }
+}
+
+// TestCreateStorage_DestroyIsIdempotent ensures the returned DestroyFunc can
+// safely be invoked more than once.
+func TestCreateStorage_DestroyIsIdempotent(t *testing.T) {
+	_, destroy, err := CreateStorage(codec)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Repeated invocation must not panic.
+	destroy()
+	destroy()
+}
diff --git a/pkg/storage/memory/store.go b/pkg/storage/memory/store.go
new file mode 100644
index 000000000..27b085383
--- /dev/null
+++ b/pkg/storage/memory/store.go
@@ -0,0 +1,492 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memory
+
+import (
+ "bytes"
+ "context"
+ "fmt"
+ "strings"
+ "sync"
+
+ "k8s.io/apimachinery/pkg/api/meta"
+ "k8s.io/apimachinery/pkg/runtime"
+ "k8s.io/apimachinery/pkg/util/validation/field"
+ "k8s.io/apimachinery/pkg/watch"
+ "k8s.io/apiserver/pkg/storage"
+)
+
+// item holds an encoded object and its associated resource version.
+type item struct {
+	key  string // storage key this item is filed under (duplicated for watcher payloads)
+	data []byte // codec-encoded object bytes
+	rv   uint64 // resource version assigned when this revision was written
+}
+
+// Store is a thread-safe, in-memory implementation of storage.Interface.
+// Objects are stored as codec-encoded bytes keyed by hierarchical path strings.
+type Store struct {
+	codec    runtime.Codec    // encodes/decodes objects to and from item.data
+	mu       sync.RWMutex     // guards items, rev, and watcher notification ordering
+	items    map[string]*item // key → latest stored revision of the object
+	rev      uint64           // monotonically increasing store-wide revision counter
+	watchers *watchManager    // fan-out of watch events to active watchers
+}
+
+// Compile-time interface compliance check.
+var _ storage.Interface = (*Store)(nil)
+
+// NewStore creates a new in-memory store that encodes and decodes objects
+// using the provided codec. The watch channel buffer uses the default size
+// (watchChannelSize). Use NewStoreWithOptions for custom buffer sizes.
+func NewStore(codec runtime.Codec) *Store {
+	return &Store{
+		codec:    codec,
+		items:    make(map[string]*item),
+		watchers: newWatchManager(watchChannelSize),
+	}
+}
+
+// Versioner returns the storage versioner used to manage resource versions on
+// API objects. This implementation uses the standard APIObjectVersioner.
+func (s *Store) Versioner() storage.Versioner {
+	return storage.APIObjectVersioner{}
+}
+
+// Create adds a new object at the given key. If an object already exists at
+// that key, a KeyExists error is returned. The out parameter, if non-nil, is
+// populated with the stored object including its assigned resource version.
+//
+// ttl is accepted for interface compatibility but ignored: this in-memory
+// store never expires objects.
+func (s *Store) Create(ctx context.Context, key string, obj, out runtime.Object, ttl uint64) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if _, exists := s.items[key]; exists {
+		return storage.NewKeyExistsError(key, 0)
+	}
+
+	// Reserve the next revision. If a later step fails the counter is not
+	// rolled back, leaving a harmless gap in the RV sequence.
+	s.rev++
+	rv := s.rev
+
+	if err := s.Versioner().PrepareObjectForStorage(obj); err != nil {
+		return fmt.Errorf("PrepareObjectForStorage failed: %w", err)
+	}
+
+	if err := s.Versioner().UpdateObject(obj, rv); err != nil {
+		return fmt.Errorf("UpdateObject failed: %w", err)
+	}
+
+	data, err := s.encode(obj)
+	if err != nil {
+		return err
+	}
+
+	s.items[key] = &item{
+		key:  key,
+		data: data,
+		rv:   rv,
+	}
+
+	// Round-trip through the stored bytes so out reflects exactly what was
+	// persisted (including the assigned resource version).
+	if out != nil {
+		if err := s.decode(data, out); err != nil {
+			return err
+		}
+	}
+
+	// DeepCopy is required: watchers must receive an isolated snapshot.
+	// The copy runs under s.mu write lock, so watch-heavy workloads
+	// should keep stored objects small.
+	s.watchers.sendLocked(watch.Event{
+		Type:   watch.Added,
+		Object: obj.DeepCopyObject(),
+	}, key)
+
+	return nil
+}
+
+// Delete removes the object at the given key. If the key does not exist,
+// a KeyNotFound error is returned. Preconditions and validation callbacks
+// are checked before deletion proceeds.
+//
+// cachedExistingObject and opts are accepted for interface compatibility;
+// this implementation always decodes the stored bytes and ignores any
+// delete options.
+func (s *Store) Delete(
+	ctx context.Context,
+	key string,
+	out runtime.Object,
+	preconditions *storage.Preconditions,
+	validateDeletion storage.ValidateObjectFunc,
+	cachedExistingObject runtime.Object,
+	opts storage.DeleteOptions,
+) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	existing, ok := s.items[key]
+	if !ok {
+		return storage.NewKeyNotFoundError(key, 0)
+	}
+
+	// Decode once; the same object is used for precondition checks, the
+	// validation callback, and the watcher notification below.
+	existingObj, err := s.decodeNew(existing.data)
+	if err != nil {
+		return err
+	}
+
+	if err := s.checkPreconditions(key, preconditions, existingObj); err != nil {
+		return err
+	}
+
+	// validateDeletion must be fast and non-blocking. It runs while the store
+	// write lock is held; a slow callback freezes all storage operations.
+	if validateDeletion != nil {
+		if err := validateDeletion(ctx, existingObj); err != nil {
+			return err
+		}
+	}
+
+	delete(s.items, key)
+
+	// Deletion consumes a revision even though no item records it.
+	s.rev++
+
+	if out != nil {
+		if err := s.decode(existing.data, out); err != nil {
+			return err
+		}
+	}
+
+	// Deep copy for watcher isolation.
+	s.watchers.sendLocked(watch.Event{
+		Type:   watch.Deleted,
+		Object: existingObj.DeepCopyObject(),
+	}, key)
+
+	return nil
+}
+
+// Watch begins watching the specified key prefix. Events matching the key
+// prefix are sent on the returned watch.Interface. The watch is automatically
+// stopped when the context is cancelled.
+//
+// The in-memory store does not support resuming watches from a specific
+// ResourceVersion. Passing a non-empty ResourceVersion returns an error.
+func (s *Store) Watch(ctx context.Context, key string, opts storage.ListOptions) (watch.Interface, error) {
+	if opts.ResourceVersion != "" {
+		return nil, storage.NewInvalidError(field.ErrorList{
+			field.Invalid(
+				field.NewPath("resourceVersion"),
+				opts.ResourceVersion,
+				"in-memory store does not support watch resume from resource version",
+			),
+		})
+	}
+
+	w := s.watchers.watch(key)
+	done := w.done // capture before spawning goroutine
+
+	// This goroutine ties the watcher's lifetime to the caller's context.
+	// It exits on whichever happens first: context cancellation (which stops
+	// the watcher) or a direct w.Stop() by the consumer (which closes done),
+	// so it can never leak.
+	go func() {
+		select {
+		case <-ctx.Done():
+			w.Stop()
+		case <-done:
+			// Watcher was stopped directly; goroutine can exit.
+		}
+	}()
+
+	return w, nil
+}
+
+// Get retrieves the object stored at the given key and decodes it into objPtr.
+// If the key does not exist and opts.IgnoreNotFound is false, a KeyNotFound
+// error is returned. If IgnoreNotFound is true, objPtr is left at its zero value.
+func (s *Store) Get(ctx context.Context, key string, opts storage.GetOptions, objPtr runtime.Object) error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if it, found := s.items[key]; found {
+		return s.decode(it.data, objPtr)
+	}
+
+	if opts.IgnoreNotFound {
+		return nil
+	}
+
+	return storage.NewKeyNotFoundError(key, 0)
+}
+
+// GetList retrieves all objects whose keys match the given prefix (when
+// opts.Recursive is true) or the exact key (otherwise), and populates
+// listObj with the matching items. The list's resource version is set to
+// the store's current revision.
+//
+// This is a full scan over all stored items (O(n) in store size); the map
+// iteration order is random, so the resulting list order is unspecified.
+func (s *Store) GetList(ctx context.Context, key string, opts storage.ListOptions, listObj runtime.Object) error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	// Normalize the prefix so "/a/b" matches "/a/b/x" but not "/a/bc/x".
+	prefix := key
+	if opts.Recursive && !strings.HasSuffix(prefix, "/") {
+		prefix += "/"
+	}
+
+	var objs []runtime.Object
+
+	for k, it := range s.items {
+		var match bool
+		if opts.Recursive {
+			match = strings.HasPrefix(k, prefix)
+		} else {
+			match = k == key
+		}
+
+		if !match {
+			continue
+		}
+
+		obj, err := s.decodeNew(it.data)
+		if err != nil {
+			return err
+		}
+
+		// Apply the label/field selection predicate only when it actually
+		// filters; predicateEmpty also shields against nil selectors.
+		if !predicateEmpty(opts.Predicate) {
+			matches, err := opts.Predicate.Matches(obj)
+			if err != nil {
+				return err
+			}
+
+			if !matches {
+				continue
+			}
+		}
+
+		objs = append(objs, obj)
+	}
+
+	if err := meta.SetList(listObj, objs); err != nil {
+		return err
+	}
+
+	return s.setListRV(listObj, s.rev)
+}
+
+// GuaranteedUpdate reads the current object at the given key, passes it to
+// tryUpdate, and writes the result back. If the key does not exist and
+// ignoreNotFound is false, a KeyNotFound error is returned.
+//
+// Unlike etcd-backed implementations, this store does NOT retry tryUpdate:
+// the whole read-modify-write runs under the store's write lock, so the
+// object cannot change concurrently and a single attempt suffices. Any
+// error returned by tryUpdate is propagated to the caller unchanged.
+// cachedExistingObject is accepted for interface compatibility and ignored.
+func (s *Store) GuaranteedUpdate(
+	ctx context.Context,
+	key string,
+	destination runtime.Object,
+	ignoreNotFound bool,
+	preconditions *storage.Preconditions,
+	tryUpdate storage.UpdateFunc,
+	cachedExistingObject runtime.Object,
+) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	existing, ok := s.items[key]
+
+	var currentObj runtime.Object
+	var currentRV uint64
+
+	if ok {
+		obj, err := s.decodeNew(existing.data)
+		if err != nil {
+			return err
+		}
+
+		currentObj = obj
+		currentRV = existing.rv
+	} else {
+		if !ignoreNotFound {
+			return storage.NewKeyNotFoundError(key, 0)
+		}
+
+		// Create path: start from a copy of destination (currentRV stays 0).
+		// NOTE(review): this assumes callers pass a zeroed destination on the
+		// create path; a destination carrying stale state would leak into the
+		// object handed to tryUpdate — confirm against callers.
+		currentObj = destination.DeepCopyObject()
+	}
+
+	if err := s.checkPreconditions(key, preconditions, currentObj); err != nil {
+		return err
+	}
+
+	updated, _, err := tryUpdate(currentObj, storage.ResponseMeta{ResourceVersion: currentRV})
+	if err != nil {
+		return err
+	}
+
+	s.rev++
+	rv := s.rev
+
+	if err := s.Versioner().UpdateObject(updated, rv); err != nil {
+		return fmt.Errorf("UpdateObject failed: %w", err)
+	}
+
+	data, err := s.encode(updated)
+	if err != nil {
+		return err
+	}
+
+	s.items[key] = &item{
+		key:  key,
+		data: data,
+		rv:   rv,
+	}
+
+	// Round-trip through the stored bytes so destination reflects exactly
+	// what was persisted.
+	if err := s.decode(data, destination); err != nil {
+		return err
+	}
+
+	// An update that created the key is reported to watchers as Added.
+	evType := watch.Modified
+	if !ok {
+		evType = watch.Added
+	}
+
+	// Deep copy for watcher isolation.
+	s.watchers.sendLocked(watch.Event{
+		Type:   evType,
+		Object: updated.DeepCopyObject(),
+	}, key)
+
+	return nil
+}
+
+// Stats returns basic storage statistics. Currently reports only the number
+// of stored objects.
+func (s *Store) Stats(ctx context.Context) (storage.Stats, error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	return storage.Stats{
+		ObjectCount: int64(len(s.items)),
+	}, nil
+}
+
+// ReadinessCheck reports whether the store is ready. The in-memory store is
+// always ready, so this always returns nil.
+func (s *Store) ReadinessCheck() error {
+	return nil
+}
+
+// RequestWatchProgress is a no-op for the in-memory store. It exists to
+// satisfy the storage.Interface and is only meaningful for etcd-backed stores.
+func (s *Store) RequestWatchProgress(ctx context.Context) error {
+	return nil
+}
+
+// GetCurrentResourceVersion returns the store's current monotonic revision.
+// The read lock ensures the value is consistent with any concurrent writes.
+func (s *Store) GetCurrentResourceVersion(ctx context.Context) (uint64, error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	return s.rev, nil
+}
+
+// EnableResourceSizeEstimation is a no-op for the in-memory store. Size
+// estimation is only relevant for disk-backed storage backends.
+func (s *Store) EnableResourceSizeEstimation(storage.KeysFunc) error {
+	return nil
+}
+
+// CompactRevision returns the latest observed compacted revision. The
+// in-memory store does not perform compaction, so this always returns 0.
+func (s *Store) CompactRevision() int64 {
+	return 0
+}
+
+// --- internal helpers ---
+
+// encode serializes an object into bytes using the store's codec.
+func (s *Store) encode(obj runtime.Object) ([]byte, error) {
+ var buf bytes.Buffer
+ if err := s.codec.Encode(obj, &buf); err != nil {
+ return nil, fmt.Errorf("encode failed: %w", err)
+ }
+
+ return buf.Bytes(), nil
+}
+
+// decode deserializes bytes into an existing object using the store's codec.
+func (s *Store) decode(data []byte, into runtime.Object) error {
+ _, _, err := s.codec.Decode(data, nil, into)
+ if err != nil {
+ return fmt.Errorf("decode failed: %w", err)
+ }
+
+ return nil
+}
+
+// decodeNew deserializes bytes into a new object allocated by the codec.
+func (s *Store) decodeNew(data []byte) (runtime.Object, error) {
+ obj, _, err := s.codec.Decode(data, nil, nil)
+ if err != nil {
+ return nil, fmt.Errorf("decode failed: %w", err)
+ }
+
+ return obj, nil
+}
+
+// setListRV sets the resource version on a list object using the versioner.
+func (s *Store) setListRV(listObj runtime.Object, rv uint64) error {
+ return s.Versioner().UpdateList(listObj, rv, "", nil)
+}
+
+// predicateEmpty returns true if the predicate performs no filtering.
+// It guards against nil Label/Field selectors that would panic in
+// SelectionPredicate.Empty().
+func predicateEmpty(p storage.SelectionPredicate) bool {
+ if p.Label == nil && p.Field == nil {
+ return true
+ }
+
+ return p.Empty()
+}
+
+// checkPreconditions verifies that the given preconditions are met by the
+// existing object. Returns an error if UID or ResourceVersion do not match.
+// A nil preconditions pointer means "no preconditions" and always passes.
+func (s *Store) checkPreconditions(key string, preconditions *storage.Preconditions, obj runtime.Object) error {
+	if preconditions == nil {
+		return nil
+	}
+
+	if preconditions.UID != nil {
+		// Use the generic accessor so any runtime.Object with ObjectMeta works.
+		accessor, err := meta.Accessor(obj)
+		if err != nil {
+			return err
+		}
+
+		if accessor.GetUID() != *preconditions.UID {
+			return storage.NewInvalidObjError(key, fmt.Sprintf(
+				"precondition UID mismatch: expected %s, got %s",
+				*preconditions.UID, accessor.GetUID(),
+			))
+		}
+	}
+
+	if preconditions.ResourceVersion != nil {
+		rv, err := s.Versioner().ObjectResourceVersion(obj)
+		if err != nil {
+			return err
+		}
+
+		// Compare numerically: the precondition RV arrives as a string and
+		// must be parsed before comparison with the object's uint64 RV.
+		expectedRV, err := s.Versioner().ParseResourceVersion(*preconditions.ResourceVersion)
+		if err != nil {
+			return err
+		}
+
+		if rv != expectedRV {
+			return storage.NewInvalidObjError(key, fmt.Sprintf(
+				"precondition ResourceVersion mismatch: expected %d, got %d",
+				expectedRV, rv,
+			))
+		}
+	}
+
+	return nil
+}
diff --git a/pkg/storage/memory/store_test.go b/pkg/storage/memory/store_test.go
new file mode 100644
index 000000000..ffd6edc0f
--- /dev/null
+++ b/pkg/storage/memory/store_test.go
@@ -0,0 +1,794 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memory
+
+import (
+ "context"
+ "fmt"
+ "testing"
+ "time"
+
+ "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+ "k8s.io/apimachinery/pkg/runtime"
+ "k8s.io/apimachinery/pkg/types"
+ "k8s.io/apimachinery/pkg/watch"
+ "k8s.io/apiserver/pkg/storage"
+)
+
+// codec is the shared codec used by all tests. UnstructuredJSONScheme handles
+// encoding and decoding of unstructured.Unstructured objects without needing
+// a registered scheme or concrete Go types.
+var codec runtime.Codec = unstructured.UnstructuredJSONScheme
+
+// newTestObject builds an *unstructured.Unstructured with the given name and
+// namespace, suitable for storage in the test store. Only apiVersion, kind,
+// and the two metadata fields are populated; tests add labels/UID as needed.
+func newTestObject(name, namespace string) *unstructured.Unstructured {
+	return &unstructured.Unstructured{
+		Object: map[string]any{
+			"apiVersion": "v1",
+			"kind":       "GPU",
+			"metadata": map[string]any{
+				"name":      name,
+				"namespace": namespace,
+			},
+		},
+	}
+}
+
+// TestStore_CreateAndGet verifies that Create persists an object, stamps
+// resourceVersion "1" on the out parameter, and that Get returns it intact.
+func TestStore_CreateAndGet(t *testing.T) {
+	s := NewStore(codec)
+	ctx := context.Background()
+
+	obj := newTestObject("gpu-0", "default")
+	out := &unstructured.Unstructured{}
+
+	if err := s.Create(ctx, "/gpus/default/gpu-0", obj, out, 0); err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Verify resourceVersion was set on the output object.
+	rv := out.GetResourceVersion()
+	if rv == "" {
+		t.Fatal("expected resourceVersion to be set on out, got empty string")
+	}
+
+	if rv != "1" {
+		t.Fatalf("expected resourceVersion '1', got %q", rv)
+	}
+
+	// Get the object back.
+	got := &unstructured.Unstructured{}
+	if err := s.Get(ctx, "/gpus/default/gpu-0", storage.GetOptions{}, got); err != nil {
+		t.Fatalf("Get failed: %v", err)
+	}
+
+	if got.GetName() != "gpu-0" {
+		t.Fatalf("expected name 'gpu-0', got %q", got.GetName())
+	}
+
+	if got.GetResourceVersion() != "1" {
+		t.Fatalf("expected resourceVersion '1', got %q", got.GetResourceVersion())
+	}
+}
+
+// TestStore_CreateDuplicate verifies that creating the same key twice fails
+// with a storage "already exists" error (storage.IsExist).
+func TestStore_CreateDuplicate(t *testing.T) {
+	s := NewStore(codec)
+	ctx := context.Background()
+
+	obj := newTestObject("gpu-0", "default")
+
+	if err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0); err != nil {
+		t.Fatalf("first Create failed: %v", err)
+	}
+
+	err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0)
+	if err == nil {
+		t.Fatal("expected error on duplicate Create, got nil")
+	}
+
+	if !storage.IsExist(err) {
+		t.Fatalf("expected IsExist error, got: %v", err)
+	}
+}
+
+// TestStore_GetNotFound verifies that Get on a missing key returns a
+// storage "not found" error (storage.IsNotFound).
+func TestStore_GetNotFound(t *testing.T) {
+	s := NewStore(codec)
+	ctx := context.Background()
+
+	got := &unstructured.Unstructured{}
+	err := s.Get(ctx, "/gpus/default/gpu-missing", storage.GetOptions{}, got)
+
+	if err == nil {
+		t.Fatal("expected error on Get for missing key, got nil")
+	}
+
+	if !storage.IsNotFound(err) {
+		t.Fatalf("expected IsNotFound error, got: %v", err)
+	}
+}
+
+// TestStore_GetList verifies that a recursive list over a prefix returns all
+// objects under it and stamps a resourceVersion on the list.
+func TestStore_GetList(t *testing.T) {
+	s := NewStore(codec)
+	ctx := context.Background()
+
+	// Create 3 objects under the same prefix.
+	for _, name := range []string{"gpu-0", "gpu-1", "gpu-2"} {
+		obj := newTestObject(name, "default")
+		if err := s.Create(ctx, "/gpus/default/"+name, obj, nil, 0); err != nil {
+			t.Fatalf("Create %s failed: %v", name, err)
+		}
+	}
+
+	list := &unstructured.UnstructuredList{}
+	opts := storage.ListOptions{
+		Recursive: true,
+		Predicate: storage.SelectionPredicate{},
+	}
+
+	if err := s.GetList(ctx, "/gpus/default", opts, list); err != nil {
+		t.Fatalf("GetList failed: %v", err)
+	}
+
+	if len(list.Items) != 3 {
+		t.Fatalf("expected 3 items, got %d", len(list.Items))
+	}
+
+	// Verify the list has a resource version.
+	if list.GetResourceVersion() == "" {
+		t.Fatal("expected list resourceVersion to be set")
+	}
+}
+
+// TestStore_GuaranteedUpdate verifies that a tryUpdate mutation (adding a
+// label) is persisted and that the object's resourceVersion is bumped to "2".
+func TestStore_GuaranteedUpdate(t *testing.T) {
+	s := NewStore(codec)
+	ctx := context.Background()
+
+	obj := newTestObject("gpu-0", "default")
+	if err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0); err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	dest := &unstructured.Unstructured{}
+	err := s.GuaranteedUpdate(ctx, "/gpus/default/gpu-0", dest, false, nil,
+		func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) {
+			u := input.(*unstructured.Unstructured)
+			labels := u.GetLabels()
+			if labels == nil {
+				labels = make(map[string]string)
+			}
+
+			labels["test-key"] = "test-value"
+			u.SetLabels(labels)
+
+			return u, nil, nil
+		}, nil)
+	if err != nil {
+		t.Fatalf("GuaranteedUpdate failed: %v", err)
+	}
+
+	// Verify the label was persisted.
+	got := &unstructured.Unstructured{}
+	if err := s.Get(ctx, "/gpus/default/gpu-0", storage.GetOptions{}, got); err != nil {
+		t.Fatalf("Get after update failed: %v", err)
+	}
+
+	labels := got.GetLabels()
+	if labels["test-key"] != "test-value" {
+		t.Fatalf("expected label 'test-key'='test-value', got labels: %v", labels)
+	}
+
+	// Verify resourceVersion was incremented.
+	if got.GetResourceVersion() != "2" {
+		t.Fatalf("expected resourceVersion '2' after update, got %q", got.GetResourceVersion())
+	}
+}
+
+// TestStore_GuaranteedUpdate_NotFound verifies that with ignoreNotFound=false
+// an update of a missing key fails with a "not found" error.
+func TestStore_GuaranteedUpdate_NotFound(t *testing.T) {
+	s := NewStore(codec)
+	ctx := context.Background()
+
+	dest := &unstructured.Unstructured{}
+	err := s.GuaranteedUpdate(ctx, "/gpus/default/gpu-missing", dest, false, nil,
+		func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) {
+			return input, nil, nil
+		}, nil)
+
+	if err == nil {
+		t.Fatal("expected error on GuaranteedUpdate for missing key with ignoreNotFound=false")
+	}
+
+	if !storage.IsNotFound(err) {
+		t.Fatalf("expected IsNotFound error, got: %v", err)
+	}
+}
+
+// TestStore_Delete verifies that Delete returns the deleted object in out and
+// that a subsequent Get fails with "not found".
+func TestStore_Delete(t *testing.T) {
+	s := NewStore(codec)
+	ctx := context.Background()
+
+	obj := newTestObject("gpu-0", "default")
+	if err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0); err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	out := &unstructured.Unstructured{}
+	err := s.Delete(ctx, "/gpus/default/gpu-0", out, nil, nil, nil, storage.DeleteOptions{})
+	if err != nil {
+		t.Fatalf("Delete failed: %v", err)
+	}
+
+	if out.GetName() != "gpu-0" {
+		t.Fatalf("expected deleted object name 'gpu-0', got %q", out.GetName())
+	}
+
+	// Verify the object is gone.
+	got := &unstructured.Unstructured{}
+	err = s.Get(ctx, "/gpus/default/gpu-0", storage.GetOptions{}, got)
+
+	if err == nil {
+		t.Fatal("expected NotFound error after delete, got nil")
+	}
+
+	if !storage.IsNotFound(err) {
+		t.Fatalf("expected IsNotFound error, got: %v", err)
+	}
+}
+
+// TestStore_DeleteNotFound verifies that Delete on a missing key fails with a
+// "not found" error.
+func TestStore_DeleteNotFound(t *testing.T) {
+	s := NewStore(codec)
+	ctx := context.Background()
+
+	out := &unstructured.Unstructured{}
+	err := s.Delete(ctx, "/gpus/default/gpu-missing", out, nil, nil, nil, storage.DeleteOptions{})
+
+	if err == nil {
+		t.Fatal("expected error on Delete for missing key, got nil")
+	}
+
+	if !storage.IsNotFound(err) {
+		t.Fatalf("expected IsNotFound error, got: %v", err)
+	}
+}
+
+// TestStore_Watch verifies that a watcher registered before a Create receives
+// an ADDED event carrying the created object.
+func TestStore_Watch(t *testing.T) {
+	s := NewStore(codec)
+	ctx := t.Context()
+
+	// Watch subscription is synchronous — the watcher is registered before
+	// Watch() returns. The subsequent Create() will acquire the store lock
+	// and broadcast to all registered watchers, including ours.
+	w, err := s.Watch(ctx, "/gpus/default/", storage.ListOptions{})
+	if err != nil {
+		t.Fatalf("Watch failed: %v", err)
+	}
+
+	defer w.Stop()
+
+	// Create object — guaranteed to notify our watcher.
+	obj := newTestObject("gpu-0", "default")
+	if err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0); err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	select {
+	case ev := <-w.ResultChan():
+		if ev.Type != watch.Added {
+			t.Fatalf("expected ADDED event, got %v", ev.Type)
+		}
+
+		u, ok := ev.Object.(*unstructured.Unstructured)
+		if !ok {
+			t.Fatalf("expected *unstructured.Unstructured, got %T", ev.Object)
+		}
+
+		if u.GetName() != "gpu-0" {
+			t.Fatalf("expected event object name 'gpu-0', got %q", u.GetName())
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatal("timed out waiting for watch event")
+	}
+}
+
+// TestStore_Watch_Delete verifies that deleting an existing object delivers a
+// DELETED event carrying the deleted object to an active watcher.
+func TestStore_Watch_Delete(t *testing.T) {
+	s := NewStore(codec)
+	ctx := t.Context()
+
+	// Create the object first, before starting the watch.
+	obj := newTestObject("gpu-0", "default")
+	if err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0); err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	w, err := s.Watch(ctx, "/gpus/default/", storage.ListOptions{})
+	if err != nil {
+		t.Fatalf("Watch failed: %v", err)
+	}
+
+	defer w.Stop()
+
+	// Delete the object; the watcher should receive a DELETED event.
+	out := &unstructured.Unstructured{}
+	if err := s.Delete(ctx, "/gpus/default/gpu-0", out, nil, nil, nil, storage.DeleteOptions{}); err != nil {
+		t.Fatalf("Delete failed: %v", err)
+	}
+
+	select {
+	case ev := <-w.ResultChan():
+		if ev.Type != watch.Deleted {
+			t.Fatalf("expected DELETED event, got %v", ev.Type)
+		}
+
+		u, ok := ev.Object.(*unstructured.Unstructured)
+		if !ok {
+			t.Fatalf("expected *unstructured.Unstructured, got %T", ev.Object)
+		}
+
+		if u.GetName() != "gpu-0" {
+			t.Fatalf("expected event object name 'gpu-0', got %q", u.GetName())
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatal("timed out waiting for DELETED watch event")
+	}
+}
+
+// TestStore_Stats verifies that Stats reports the number of stored objects.
+func TestStore_Stats(t *testing.T) {
+	s := NewStore(codec)
+	ctx := context.Background()
+
+	for _, name := range []string{"gpu-0", "gpu-1"} {
+		obj := newTestObject(name, "default")
+		if err := s.Create(ctx, "/gpus/default/"+name, obj, nil, 0); err != nil {
+			t.Fatalf("Create %s failed: %v", name, err)
+		}
+	}
+
+	stats, err := s.Stats(ctx)
+	if err != nil {
+		t.Fatalf("Stats failed: %v", err)
+	}
+
+	if stats.ObjectCount != 2 {
+		t.Fatalf("expected ObjectCount 2, got %d", stats.ObjectCount)
+	}
+}
+
+// TestStore_ReadinessCheck verifies that a freshly constructed store reports
+// ready immediately.
+func TestStore_ReadinessCheck(t *testing.T) {
+	s := NewStore(codec)
+
+	if err := s.ReadinessCheck(); err != nil {
+		t.Fatalf("ReadinessCheck failed: %v", err)
+	}
+}
+
+// TestStore_GetCurrentResourceVersion verifies the revision counter starts at
+// 0 and increments by one per Create.
+func TestStore_GetCurrentResourceVersion(t *testing.T) {
+	s := NewStore(codec)
+	ctx := context.Background()
+
+	rv0, err := s.GetCurrentResourceVersion(ctx)
+	if err != nil {
+		t.Fatalf("GetCurrentResourceVersion failed: %v", err)
+	}
+
+	if rv0 != 0 {
+		t.Fatalf("expected initial resourceVersion 0, got %d", rv0)
+	}
+
+	// Create two objects; each should increment the revision.
+	for _, name := range []string{"gpu-0", "gpu-1"} {
+		obj := newTestObject(name, "default")
+		if err := s.Create(ctx, "/gpus/default/"+name, obj, nil, 0); err != nil {
+			t.Fatalf("Create %s failed: %v", name, err)
+		}
+	}
+
+	rv2, err := s.GetCurrentResourceVersion(ctx)
+	if err != nil {
+		t.Fatalf("GetCurrentResourceVersion failed: %v", err)
+	}
+
+	if rv2 != 2 {
+		t.Fatalf("expected resourceVersion 2 after two creates, got %d", rv2)
+	}
+}
+
+// TestStore_DeleteWithPreconditions verifies that Delete rejects a wrong-UID
+// precondition (leaving the object intact) and succeeds with the correct UID.
+func TestStore_DeleteWithPreconditions(t *testing.T) {
+	s := NewStore(codec)
+	ctx := context.Background()
+
+	obj := newTestObject("gpu-0", "default")
+	obj.SetUID("test-uid-123")
+
+	out := &unstructured.Unstructured{}
+	if err := s.Create(ctx, "/gpus/default/gpu-0", obj, out, 0); err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Delete with wrong UID precondition should fail.
+	wrongUID := types.UID("wrong-uid")
+	precond := &storage.Preconditions{UID: &wrongUID}
+	delOut := &unstructured.Unstructured{}
+	err := s.Delete(ctx, "/gpus/default/gpu-0", delOut, precond, nil, nil, storage.DeleteOptions{})
+	if err == nil {
+		t.Fatal("expected error on Delete with wrong UID precondition, got nil")
+	}
+
+	// Verify the object still exists.
+	got := &unstructured.Unstructured{}
+	if err := s.Get(ctx, "/gpus/default/gpu-0", storage.GetOptions{}, got); err != nil {
+		t.Fatalf("Get after failed delete should succeed: %v", err)
+	}
+
+	// Delete with correct UID precondition should succeed.
+	correctUID := types.UID("test-uid-123")
+	precond = &storage.Preconditions{UID: &correctUID}
+	delOut = &unstructured.Unstructured{}
+	if err := s.Delete(ctx, "/gpus/default/gpu-0", delOut, precond, nil, nil, storage.DeleteOptions{}); err != nil {
+		t.Fatalf("Delete with correct UID precondition failed: %v", err)
+	}
+
+	if delOut.GetName() != "gpu-0" {
+		t.Fatalf("expected deleted object name 'gpu-0', got %q", delOut.GetName())
+	}
+
+	// Verify the object is gone.
+	err = s.Get(ctx, "/gpus/default/gpu-0", storage.GetOptions{}, &unstructured.Unstructured{})
+	if err == nil {
+		t.Fatal("expected NotFound error after delete, got nil")
+	}
+
+	if !storage.IsNotFound(err) {
+		t.Fatalf("expected IsNotFound error, got: %v", err)
+	}
+}
+
+// TestStore_GuaranteedUpdate_Preconditions verifies that GuaranteedUpdate
+// rejects a wrong-UID precondition without mutating the object, then applies
+// the update (and bumps the RV) when the UID matches.
+func TestStore_GuaranteedUpdate_Preconditions(t *testing.T) {
+	s := NewStore(codec)
+	ctx := context.Background()
+
+	obj := newTestObject("gpu-0", "default")
+	obj.SetUID("known-uid-456")
+
+	if err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0); err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// GuaranteedUpdate with wrong UID precondition should fail.
+	wrongUID := types.UID("wrong-uid")
+	precond := &storage.Preconditions{UID: &wrongUID}
+	dest := &unstructured.Unstructured{}
+	err := s.GuaranteedUpdate(ctx, "/gpus/default/gpu-0", dest, false, precond,
+		func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) {
+			return input, nil, nil
+		}, nil)
+	if err == nil {
+		t.Fatal("expected error on GuaranteedUpdate with wrong UID precondition, got nil")
+	}
+
+	// Verify the object was not modified (still at resourceVersion 1).
+	got := &unstructured.Unstructured{}
+	if err := s.Get(ctx, "/gpus/default/gpu-0", storage.GetOptions{}, got); err != nil {
+		t.Fatalf("Get failed: %v", err)
+	}
+
+	if got.GetResourceVersion() != "1" {
+		t.Fatalf("expected resourceVersion '1' (unmodified), got %q", got.GetResourceVersion())
+	}
+
+	// GuaranteedUpdate with correct UID precondition should succeed.
+	correctUID := types.UID("known-uid-456")
+	precond = &storage.Preconditions{UID: &correctUID}
+	dest = &unstructured.Unstructured{}
+	err = s.GuaranteedUpdate(ctx, "/gpus/default/gpu-0", dest, false, precond,
+		func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) {
+			u := input.(*unstructured.Unstructured)
+			labels := u.GetLabels()
+			if labels == nil {
+				labels = make(map[string]string)
+			}
+
+			labels["updated"] = "true"
+			u.SetLabels(labels)
+
+			return u, nil, nil
+		}, nil)
+	if err != nil {
+		t.Fatalf("GuaranteedUpdate with correct UID precondition failed: %v", err)
+	}
+
+	// Verify the update was applied.
+	got = &unstructured.Unstructured{}
+	if err := s.Get(ctx, "/gpus/default/gpu-0", storage.GetOptions{}, got); err != nil {
+		t.Fatalf("Get after update failed: %v", err)
+	}
+
+	if got.GetLabels()["updated"] != "true" {
+		t.Fatalf("expected label 'updated'='true', got labels: %v", got.GetLabels())
+	}
+
+	if got.GetResourceVersion() != "2" {
+		t.Fatalf("expected resourceVersion '2' after update, got %q", got.GetResourceVersion())
+	}
+}
+
+// TestStore_GuaranteedUpdate_IgnoreNotFound verifies that with
+// ignoreNotFound=true a missing key yields a zero-value input to tryUpdate
+// and that the returned object is created in the store.
+func TestStore_GuaranteedUpdate_IgnoreNotFound(t *testing.T) {
+	s := NewStore(codec)
+	ctx := context.Background()
+
+	dest := &unstructured.Unstructured{}
+	var receivedEmpty bool
+	err := s.GuaranteedUpdate(ctx, "/gpus/default/gpu-new", dest, true, nil,
+		func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) {
+			u := input.(*unstructured.Unstructured)
+			// When ignoreNotFound is true and the key doesn't exist, the input
+			// should be a zero-value object (deep copy of destination).
+			if u.GetName() == "" && u.GetNamespace() == "" {
+				receivedEmpty = true
+			}
+
+			// Populate the object so it gets created.
+			u.SetUnstructuredContent(map[string]any{
+				"apiVersion": "v1",
+				"kind":       "GPU",
+				"metadata": map[string]any{
+					"name":      "gpu-new",
+					"namespace": "default",
+				},
+			})
+
+			return u, nil, nil
+		}, nil)
+	if err != nil {
+		t.Fatalf("GuaranteedUpdate with ignoreNotFound=true failed: %v", err)
+	}
+
+	if !receivedEmpty {
+		t.Fatal("expected tryUpdate to receive a zero-value object, but it did not")
+	}
+
+	// Verify the object was created and can be retrieved.
+	got := &unstructured.Unstructured{}
+	if err := s.Get(ctx, "/gpus/default/gpu-new", storage.GetOptions{}, got); err != nil {
+		t.Fatalf("Get after GuaranteedUpdate (ignoreNotFound) failed: %v", err)
+	}
+
+	if got.GetName() != "gpu-new" {
+		t.Fatalf("expected name 'gpu-new', got %q", got.GetName())
+	}
+
+	if got.GetResourceVersion() == "" {
+		t.Fatal("expected resourceVersion to be set, got empty string")
+	}
+}
+
+// TestStore_Watch_Modified verifies the event sequence ADDED then MODIFIED:
+// after consuming the create event, a label update via GuaranteedUpdate must
+// deliver a MODIFIED event carrying the updated object.
+func TestStore_Watch_Modified(t *testing.T) {
+	s := NewStore(codec)
+	ctx := t.Context()
+
+	w, err := s.Watch(ctx, "/gpus/default/", storage.ListOptions{})
+	if err != nil {
+		t.Fatalf("Watch failed: %v", err)
+	}
+
+	defer w.Stop()
+
+	// Create an object.
+	obj := newTestObject("gpu-0", "default")
+	if err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0); err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Consume the ADDED event.
+	select {
+	case ev := <-w.ResultChan():
+		if ev.Type != watch.Added {
+			t.Fatalf("expected ADDED event, got %v", ev.Type)
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatal("timed out waiting for ADDED watch event")
+	}
+
+	// Update the object via GuaranteedUpdate.
+	dest := &unstructured.Unstructured{}
+	err = s.GuaranteedUpdate(ctx, "/gpus/default/gpu-0", dest, false, nil,
+		func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) {
+			u := input.(*unstructured.Unstructured)
+			labels := u.GetLabels()
+			if labels == nil {
+				labels = make(map[string]string)
+			}
+
+			labels["modified"] = "true"
+			u.SetLabels(labels)
+
+			return u, nil, nil
+		}, nil)
+	if err != nil {
+		t.Fatalf("GuaranteedUpdate failed: %v", err)
+	}
+
+	// Verify a MODIFIED event is received.
+	select {
+	case ev := <-w.ResultChan():
+		if ev.Type != watch.Modified {
+			t.Fatalf("expected MODIFIED event, got %v", ev.Type)
+		}
+
+		u, ok := ev.Object.(*unstructured.Unstructured)
+		if !ok {
+			t.Fatalf("expected *unstructured.Unstructured, got %T", ev.Object)
+		}
+
+		if u.GetLabels()["modified"] != "true" {
+			t.Fatalf("expected label 'modified'='true' on event object, got labels: %v", u.GetLabels())
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatal("timed out waiting for MODIFIED watch event")
+	}
+}
+
+// TestStore_Watch_KeyPrefixFiltering verifies that a watcher only receives
+// events for keys under its registered prefix: an object created in another
+// namespace produces no event, while one under the prefix produces ADDED.
+func TestStore_Watch_KeyPrefixFiltering(t *testing.T) {
+	s := NewStore(codec)
+	ctx := t.Context()
+
+	// Watch only the /gpus/default/ prefix.
+	w, err := s.Watch(ctx, "/gpus/default/", storage.ListOptions{})
+	if err != nil {
+		t.Fatalf("Watch failed: %v", err)
+	}
+
+	defer w.Stop()
+
+	// Create an object under a different namespace; should NOT produce an event.
+	otherObj := newTestObject("gpu-0", "other-ns")
+	if err := s.Create(ctx, "/gpus/other-ns/gpu-0", otherObj, nil, 0); err != nil {
+		t.Fatalf("Create other-ns object failed: %v", err)
+	}
+
+	// Verify no event is received within a short timeout.
+	select {
+	case ev := <-w.ResultChan():
+		t.Fatalf("expected no event for other-ns object, but got %v event", ev.Type)
+	case <-time.After(500 * time.Millisecond):
+		// Good: no event received.
+	}
+
+	// Create an object under the watched prefix; SHOULD produce an ADDED event.
+	defaultObj := newTestObject("gpu-0", "default")
+	if err := s.Create(ctx, "/gpus/default/gpu-0", defaultObj, nil, 0); err != nil {
+		t.Fatalf("Create default object failed: %v", err)
+	}
+
+	select {
+	case ev := <-w.ResultChan():
+		if ev.Type != watch.Added {
+			t.Fatalf("expected ADDED event, got %v", ev.Type)
+		}
+
+		u, ok := ev.Object.(*unstructured.Unstructured)
+		if !ok {
+			t.Fatalf("expected *unstructured.Unstructured, got %T", ev.Object)
+		}
+
+		if u.GetName() != "gpu-0" {
+			t.Fatalf("expected event object name 'gpu-0', got %q", u.GetName())
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatal("timed out waiting for ADDED watch event for default namespace object")
+	}
+}
+
+// TestStore_GetIgnoreNotFound verifies that Get with IgnoreNotFound=true on a
+// missing key succeeds and leaves the destination at its zero value.
+func TestStore_GetIgnoreNotFound(t *testing.T) {
+	s := NewStore(codec)
+	ctx := context.Background()
+
+	got := &unstructured.Unstructured{}
+	err := s.Get(ctx, "/gpus/default/gpu-missing", storage.GetOptions{IgnoreNotFound: true}, got)
+	if err != nil {
+		t.Fatalf("expected no error with IgnoreNotFound=true, got: %v", err)
+	}
+
+	// The object should be at its zero value (no name set).
+	if got.GetName() != "" {
+		t.Fatalf("expected empty name on zero-value object, got %q", got.GetName())
+	}
+}
+
+// TestStore_GetList_NonRecursive verifies that Recursive=false on an exact key
+// returns only the object at that key, not its prefix siblings.
+func TestStore_GetList_NonRecursive(t *testing.T) {
+	s := NewStore(codec)
+	ctx := context.Background()
+
+	// Create two objects under the same prefix.
+	for _, name := range []string{"gpu-0", "gpu-1"} {
+		obj := newTestObject(name, "default")
+		if err := s.Create(ctx, "/gpus/default/"+name, obj, nil, 0); err != nil {
+			t.Fatalf("Create %s failed: %v", name, err)
+		}
+	}
+
+	// GetList with Recursive=false on an exact key should return only that one item.
+	list := &unstructured.UnstructuredList{}
+	opts := storage.ListOptions{
+		Recursive: false,
+		Predicate: storage.SelectionPredicate{},
+	}
+
+	if err := s.GetList(ctx, "/gpus/default/gpu-0", opts, list); err != nil {
+		t.Fatalf("GetList failed: %v", err)
+	}
+
+	if len(list.Items) != 1 {
+		t.Fatalf("expected 1 item with non-recursive GetList, got %d", len(list.Items))
+	}
+
+	if list.Items[0].GetName() != "gpu-0" {
+		t.Fatalf("expected item name 'gpu-0', got %q", list.Items[0].GetName())
+	}
+}
+
+// TestStore_ImplementsInterface is a compile-time assertion only; the test
+// body is intentionally empty at runtime.
+func TestStore_ImplementsInterface(t *testing.T) {
+	// Compile-time check that *Store satisfies storage.Interface.
+	var _ storage.Interface = (*Store)(nil)
+}
+
+// TestStore_Watch_RejectsResourceVersion verifies that Watch refuses a
+// non-empty ResourceVersion in ListOptions (historical replay is unsupported).
+func TestStore_Watch_RejectsResourceVersion(t *testing.T) {
+	s := NewStore(codec)
+	ctx := t.Context()
+
+	_, err := s.Watch(ctx, "/gpus/default/", storage.ListOptions{
+		ResourceVersion: "5",
+	})
+	if err == nil {
+		t.Fatal("expected error when Watch is called with non-empty ResourceVersion, got nil")
+	}
+}
+
+// TestStore_Watch_EventDropOnFullBuffer verifies the non-blocking send
+// behavior: with more creates than the channel buffer holds, exactly
+// watchChannelSize events are delivered and the overflow is dropped.
+func TestStore_Watch_EventDropOnFullBuffer(t *testing.T) {
+	s := NewStore(codec)
+	ctx := t.Context()
+
+	w, err := s.Watch(ctx, "/gpus/default/", storage.ListOptions{})
+	if err != nil {
+		t.Fatalf("Watch failed: %v", err)
+	}
+	defer w.Stop()
+
+	// Fill the channel buffer (watchChannelSize = 100) plus overflow.
+	for i := 0; i < watchChannelSize+10; i++ {
+		name := fmt.Sprintf("gpu-%d", i)
+		obj := newTestObject(name, "default")
+		if err := s.Create(ctx, "/gpus/default/"+name, obj, nil, 0); err != nil {
+			t.Fatalf("Create %s failed: %v", name, err)
+		}
+	}
+
+	// Drain the channel. We should get exactly watchChannelSize events
+	// (the rest were dropped because the buffer was full).
+	received := 0
+	for {
+		select {
+		case _, ok := <-w.ResultChan():
+			if !ok {
+				t.Fatal("channel unexpectedly closed")
+			}
+			received++
+		default:
+			goto done
+		}
+	}
+done:
+	if received != watchChannelSize {
+		t.Fatalf("expected %d events (buffer size), got %d", watchChannelSize, received)
+	}
+}
diff --git a/pkg/storage/memory/watch.go b/pkg/storage/memory/watch.go
new file mode 100644
index 000000000..6d6f7dd9b
--- /dev/null
+++ b/pkg/storage/memory/watch.go
@@ -0,0 +1,130 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memory
+
+import (
+ "strings"
+ "sync"
+ "sync/atomic"
+
+ "k8s.io/apimachinery/pkg/watch"
+ "k8s.io/klog/v2"
+)
+
+// watchChannelSize is the per-watcher event buffer; sends beyond this are
+// dropped (see sendLocked).
+const watchChannelSize = 100
+
+// watchManager tracks active watchers and broadcasts events to them.
+// It uses its own mutex, separate from Store.mu, because sendLocked
+// is called while the Store write lock is held.
+type watchManager struct {
+	mu              sync.Mutex // guards watchers and nextID
+	watchers        map[int]*memoryWatcher
+	nextID          int // monotonically increasing watcher ID
+	watchBufferSize int // capacity of each watcher's event channel
+}
+
+// newWatchManager returns a watchManager whose watchers will each buffer up
+// to bufferSize events.
+func newWatchManager(bufferSize int) *watchManager {
+	return &watchManager{
+		watchers:        make(map[int]*memoryWatcher),
+		watchBufferSize: bufferSize,
+	}
+}
+
+// watch creates a new watcher for the given key prefix and registers it.
+// The caller must cancel the context or call Stop() to clean up.
+// Registration is synchronous: once watch returns, the watcher will see any
+// subsequent broadcast whose key matches its prefix.
+func (wm *watchManager) watch(key string) *memoryWatcher {
+	wm.mu.Lock()
+	defer wm.mu.Unlock()
+
+	id := wm.nextID
+	wm.nextID++
+
+	w := &memoryWatcher{
+		id:     id,
+		key:    key,
+		ch:     make(chan watch.Event, wm.watchBufferSize),
+		done:   make(chan struct{}),
+		parent: wm,
+	}
+
+	wm.watchers[id] = w
+
+	return w
+}
+
+// sendLocked broadcasts an event to all registered watchers whose key prefix
+// matches the event's object key. This method is called while Store.mu is
+// held (write lock), so it uses its own mutex for watcher iteration.
+// Sends are non-blocking: if a watcher's channel is full, the event is dropped
+// and counted in droppedEvents.
+//
+// NOTE(review): the "Locked" suffix conventionally means the CALLER holds the
+// lock, but this method acquires wm.mu itself; consider renaming (e.g.
+// broadcast) — callers live outside this file, so confirm before changing.
+func (wm *watchManager) sendLocked(ev watch.Event, objectKey string) {
+	wm.mu.Lock()
+	defer wm.mu.Unlock()
+
+	for _, w := range wm.watchers {
+		if !strings.HasPrefix(objectKey, w.key) {
+			continue
+		}
+
+		select {
+		case w.ch <- ev:
+		default:
+			// Buffer full: drop rather than block the store's write path.
+			w.droppedEvents.Add(1)
+		}
+	}
+}
+
+// remove unregisters a watcher by ID. Because it takes wm.mu, any in-flight
+// sendLocked completes before removal; after remove returns, no further
+// events can be sent to the watcher's channel.
+func (wm *watchManager) remove(id int) {
+	wm.mu.Lock()
+	defer wm.mu.Unlock()
+
+	delete(wm.watchers, id)
+}
+
+// memoryWatcher implements watch.Interface for in-memory storage events.
+type memoryWatcher struct {
+	id            int              // manager-assigned identifier
+	key           string           // key prefix this watcher subscribes to
+	ch            chan watch.Event // buffered event channel (see watchChannelSize)
+	done          chan struct{}    // closed by Stop; presumably observed by store-side code outside this file — confirm
+	once          sync.Once        // ensures Stop's cleanup runs at most once
+	parent        *watchManager    // manager to unregister from on Stop
+	droppedEvents atomic.Int64     // events discarded due to a full channel
+}
+
+// Compile-time check that *memoryWatcher satisfies watch.Interface.
+var _ watch.Interface = (*memoryWatcher)(nil)
+
+// ResultChan returns the channel that receives watch events. The channel is
+// closed by Stop.
+func (w *memoryWatcher) ResultChan() <-chan watch.Event {
+	return w.ch
+}
+
+// Stop terminates the watcher, unregisters it from the parent manager,
+// and closes the result channel. It is safe to call multiple times.
+//
+// Ordering matters: remove() acquires the manager mutex, so any concurrent
+// sendLocked finishes (or skips this watcher) before the channel is closed,
+// preventing a send-on-closed-channel panic.
+func (w *memoryWatcher) Stop() {
+	w.once.Do(func() {
+		// Surface drop counts for observability before tearing down.
+		if dropped := w.droppedEvents.Load(); dropped > 0 {
+			klog.V(2).InfoS("Watch stopped with dropped events",
+				"watcherID", w.id,
+				"key", w.key,
+				"droppedEvents", dropped,
+			)
+		}
+
+		w.parent.remove(w.id)
+		close(w.done)
+		close(w.ch)
+	})
+}
diff --git a/pkg/storage/storagebackend/config.go b/pkg/storage/storagebackend/config.go
index f6867f337..840f52708 100644
--- a/pkg/storage/storagebackend/config.go
+++ b/pkg/storage/storagebackend/config.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -28,6 +28,10 @@ type Config struct {
KineSocketPath string
DatabaseDir string
+ // InMemory skips Kine/SQLite entirely. Services supply their own
+ // in-memory storage.Interface, so the backend only needs to report ready.
+ InMemory bool
+
StorageConfig apistorage.Config
}
@@ -40,6 +44,7 @@ func NewConfig(ctx context.Context, opts options.CompletedOptions) (*Config, err
KineConfig: opts.KineConfig,
KineSocketPath: opts.KineSocketPath,
DatabaseDir: opts.DatabaseDir,
+ InMemory: opts.InMemory,
}
if err := opts.ApplyTo(&config.StorageConfig); err != nil {
diff --git a/pkg/storage/storagebackend/config_test.go b/pkg/storage/storagebackend/config_test.go
index ed5b5fcc3..e665891c8 100644
--- a/pkg/storage/storagebackend/config_test.go
+++ b/pkg/storage/storagebackend/config_test.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@ func TestNewConfig(t *testing.T) {
ctx := context.Background()
opts := options.NewOptions()
+ opts.InMemory = false
opts.DatabasePath = "/tmp/nvsentinel/test.db"
completedOpts, err := opts.Complete()
diff --git a/pkg/storage/storagebackend/options/options.go b/pkg/storage/storagebackend/options/options.go
index 306d02b4f..8951abbcf 100644
--- a/pkg/storage/storagebackend/options/options.go
+++ b/pkg/storage/storagebackend/options/options.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -28,6 +28,10 @@ import (
)
type Options struct {
+ // InMemory skips the Kine/SQLite storage backend entirely.
+ // When true, services provide their own in-memory storage.Interface.
+ InMemory bool
+
DatabasePath string
CompactionInterval time.Duration
CompactionBatchSize int64
@@ -49,6 +53,7 @@ type CompletedOptions struct {
func NewOptions() *Options {
return &Options{
+ InMemory: true,
DatabasePath: "/var/lib/nvidia-device-api/state.db",
CompactionInterval: 5 * time.Minute,
CompactionBatchSize: 1000,
@@ -64,6 +69,9 @@ func (o *Options) AddFlags(fss *cliflag.NamedFlagSets) {
storageFs := fss.FlagSet("storage")
+ storageFs.BoolVar(&o.InMemory, "in-memory", o.InMemory,
+ "Use in-memory storage instead of SQLite/Kine. Services provide their own storage.Interface.")
+
storageFs.StringVar(&o.DatabasePath, "database-path", o.DatabasePath,
"The path to the SQLite database file. Must be an absolute path.")
@@ -80,6 +88,12 @@ func (o *Options) Complete() (CompletedOptions, error) {
return CompletedOptions{}, nil
}
+ // In-memory mode skips all Kine/SQLite configuration.
+ if o.InMemory {
+ completed := completedOptions{Options: *o}
+ return CompletedOptions{completedOptions: &completed}, nil
+ }
+
if o.KineSocketPath == "" {
o.KineSocketPath = "/var/run/nvidia-device-api/kine.sock"
}
@@ -127,6 +141,11 @@ func (o *Options) Validate() []error {
return nil
}
+ // In-memory mode requires no Kine/SQLite configuration.
+ if o.InMemory {
+ return nil
+ }
+
allErrors := []error{}
if o.DatabasePath == "" {
diff --git a/pkg/storage/storagebackend/options/options_test.go b/pkg/storage/storagebackend/options/options_test.go
index 9079915fb..e5cfe1e83 100644
--- a/pkg/storage/storagebackend/options/options_test.go
+++ b/pkg/storage/storagebackend/options/options_test.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -61,6 +61,7 @@ func TestAddFlags(t *testing.T) {
func TestComplete(t *testing.T) {
t.Run("Default assignments", func(t *testing.T) {
opts := NewOptions()
+ opts.InMemory = false
opts.DatabasePath = ""
opts.KineSocketPath = ""
@@ -85,6 +86,7 @@ func TestComplete(t *testing.T) {
t.Run("Trims unix prefix from SocketPath", func(t *testing.T) {
opts := NewOptions()
+ opts.InMemory = false
opts.KineSocketPath = "unix:///tmp/test.sock"
completed, _ := opts.Complete()
@@ -95,6 +97,7 @@ func TestComplete(t *testing.T) {
t.Run("Maps intervals to KineConfig", func(t *testing.T) {
opts := NewOptions()
+ opts.InMemory = false
opts.CompactionInterval = 10 * time.Minute
opts.WatchProgressNotifyInterval = 15 * time.Second
@@ -181,6 +184,7 @@ func TestValidate(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
opts := NewOptions()
+ opts.InMemory = false
tt.modify(opts)
completed, err := opts.Complete()
@@ -211,6 +215,7 @@ func TestValidate(t *testing.T) {
func TestApplyTo(t *testing.T) {
opts := NewOptions()
+ opts.InMemory = false
completed, _ := opts.Complete()
storageCfg := &apistorage.Config{}
diff --git a/pkg/storage/storagebackend/storage.go b/pkg/storage/storagebackend/storage.go
index 2502efac9..ab790b4f5 100644
--- a/pkg/storage/storagebackend/storage.go
+++ b/pkg/storage/storagebackend/storage.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@ import (
"path/filepath"
"strings"
"sync/atomic"
+ "syscall"
"time"
"github.com/k3s-io/kine/pkg/endpoint"
@@ -39,6 +40,10 @@ type Storage struct {
StorageConfig apistorage.Config
ETCDConfig *endpoint.ETCDConfig
+ // InMemory skips Kine/SQLite entirely. When true, the storage backend
+ // reports ready immediately and services use their own in-memory storage.
+ InMemory bool
+
isReady atomic.Bool
}
@@ -52,10 +57,15 @@ func (c *CompletedConfig) New() (*Storage, error) {
KineSocketPath: c.KineSocketPath,
DatabaseDir: c.DatabaseDir,
StorageConfig: c.StorageConfig,
+ InMemory: c.InMemory,
}, nil
}
func (s *Storage) PrepareRun(ctx context.Context) (preparedStorage, error) {
+ if s.InMemory {
+ return preparedStorage{s}, nil
+ }
+
if err := s.prepareFilesystem(ctx); err != nil {
return preparedStorage{}, err
}
@@ -101,9 +111,22 @@ func (s *preparedStorage) Run(ctx context.Context) error {
func (s *Storage) run(ctx context.Context) error {
logger := klog.FromContext(ctx)
+ if s.InMemory {
+ logger.V(2).Info("Starting in-memory storage backend (no persistence)")
+ s.isReady.Store(true)
+ <-ctx.Done()
+ logger.Info("Shutting down in-memory storage backend")
+ s.isReady.Store(false)
+ return nil
+ }
+
logger.V(2).Info("Starting storage backend", "database", s.KineConfig.Endpoint)
s.isReady.Store(false)
+ // Restrict permissions on new files (socket) before Kine creates it.
+ oldUmask := syscall.Umask(0117) // Creates socket as 0660 from the start
+ defer syscall.Umask(oldUmask)
+
etcdConfig, err := endpoint.Listen(ctx, s.KineConfig)
if err != nil {
return fmt.Errorf("failed to start storage backend: %w", err)
@@ -114,7 +137,7 @@ func (s *Storage) run(ctx context.Context) error {
socketPath := strings.TrimPrefix(s.KineSocketPath, "unix://")
defer func() {
if err := netutils.CleanupUDS(socketPath); err != nil {
- klog.V(2).ErrorS(err, "Failed to cleanup socket", "path", socketPath)
+ klog.ErrorS(err, "Failed to cleanup kine socket", "path", socketPath)
}
}()
@@ -157,8 +180,14 @@ func (s *Storage) waitForSocket(ctx context.Context) error {
}
conn.Close() //nolint:wsl_v5
+ //nolint:gosec // G302: 0660 intentional — server and provider share a group
if err := os.Chmod(socketPath, 0660); err != nil {
+ if os.IsPermission(err) {
+ return false, fmt.Errorf("failed to secure kine socket %q: %w", socketPath, err)
+ }
+
logger.V(4).Error(err, "Failed to secure socket, retrying", "path", socketPath)
+
return false, nil
}
@@ -169,8 +198,6 @@ func (s *Storage) waitForSocket(ctx context.Context) error {
return fmt.Errorf("timed out waiting to connect to socket: %w", err)
}
- s.isReady.Store(true)
-
return nil
}
diff --git a/pkg/storage/storagebackend/storage_test.go b/pkg/storage/storagebackend/storage_test.go
index b992d0602..7d446eadf 100644
--- a/pkg/storage/storagebackend/storage_test.go
+++ b/pkg/storage/storagebackend/storage_test.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -113,6 +113,43 @@ func TestStorage_SocketInUse(t *testing.T) {
}
}
+func TestStorage_InMemoryMode(t *testing.T) {
+ s := &Storage{InMemory: true}
+
+ ctx, cancel := context.WithCancel(context.Background())
+ defer cancel()
+
+ ps, err := s.PrepareRun(ctx)
+ if err != nil {
+ t.Fatalf("PrepareRun failed: %v", err)
+ }
+
+ runErr := make(chan error, 1)
+ go func() {
+ runErr <- ps.Run(ctx)
+ }()
+
+ // In-memory should become ready almost immediately.
+ waitErr := wait.PollUntilContextTimeout(ctx, 10*time.Millisecond, 2*time.Second, true, func(ctx context.Context) (bool, error) {
+ return s.IsReady(), nil
+ })
+ if waitErr != nil {
+ t.Fatal("In-memory storage did not become ready")
+ }
+
+ cancel()
+
+ select {
+ case <-runErr:
+ case <-time.After(2 * time.Second):
+ t.Error("In-memory storage did not shut down gracefully")
+ }
+
+ if s.IsReady() {
+ t.Error("In-memory storage should not be ready after shutdown")
+ }
+}
+
func TestStorage_WaitForSocket_Timeout(t *testing.T) {
socketPath := testutils.NewUnixAddr(t)
socketURL := "unix://" + socketPath
diff --git a/pkg/testutil/grpcserver.go b/pkg/testutil/grpcserver.go
new file mode 100644
index 000000000..3e9971474
--- /dev/null
+++ b/pkg/testutil/grpcserver.go
@@ -0,0 +1,118 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package testutil provides shared test infrastructure for gRPC integration tests.
+package testutil
+
+import (
+ "context"
+ "net"
+ "testing"
+
+ clientset "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned"
+ gpuclient "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned/typed/device/v1alpha1"
+
+ pb "github.com/nvidia/nvsentinel/internal/generated/device/v1alpha1"
+ svc "github.com/nvidia/nvsentinel/pkg/services/device/v1alpha1"
+ "google.golang.org/grpc"
+ "google.golang.org/grpc/credentials/insecure"
+ "google.golang.org/grpc/test/bufconn"
+ apistorage "k8s.io/apiserver/pkg/storage/storagebackend"
+)
+
+// NewTestGPUClient creates a bufconn-backed gRPC client for testing.
+// It spins up a real gRPC server with the GPU service backed by in-memory storage.
+// All resources are cleaned up when t finishes.
+func NewTestGPUClient(t *testing.T) pb.GpuServiceClient {
+ t.Helper()
+
+ lis := bufconn.Listen(1024 * 1024)
+ srv := grpc.NewServer()
+
+ provider := svc.NewGPUServiceProvider()
+ service, err := provider.Install(srv, apistorage.Config{})
+ if err != nil {
+ t.Fatalf("failed to install GPU service: %v", err)
+ }
+
+ go func() {
+ if err := srv.Serve(lis); err != nil {
+ t.Logf("server stopped: %v", err)
+ }
+ }()
+
+ conn, err := grpc.NewClient(
+ "passthrough:///bufconn",
+ grpc.WithContextDialer(func(context.Context, string) (net.Conn, error) {
+ return lis.Dial()
+ }),
+ grpc.WithTransportCredentials(insecure.NewCredentials()),
+ )
+ if err != nil {
+ t.Fatalf("failed to create gRPC client: %v", err)
+ }
+
+ t.Cleanup(func() {
+ conn.Close()
+ service.Cleanup()
+ srv.Stop()
+ lis.Close()
+ })
+
+ return pb.NewGpuServiceClient(conn)
+}
+
+// NewTestGPUTypedClient creates a bufconn-backed typed GPU client for testing.
+// It spins up a real gRPC server with the GPU service backed by in-memory storage,
+// and returns a GPUInterface from the generated client SDK.
+// All resources are cleaned up when t finishes.
+func NewTestGPUTypedClient(t *testing.T) gpuclient.GPUInterface {
+ t.Helper()
+
+ lis := bufconn.Listen(1024 * 1024)
+ srv := grpc.NewServer()
+
+ provider := svc.NewGPUServiceProvider()
+ service, err := provider.Install(srv, apistorage.Config{})
+ if err != nil {
+ t.Fatalf("failed to install GPU service: %v", err)
+ }
+
+ go func() {
+ if err := srv.Serve(lis); err != nil {
+ t.Logf("server stopped: %v", err)
+ }
+ }()
+
+ conn, err := grpc.NewClient(
+ "passthrough:///bufconn",
+ grpc.WithContextDialer(func(context.Context, string) (net.Conn, error) {
+ return lis.Dial()
+ }),
+ grpc.WithTransportCredentials(insecure.NewCredentials()),
+ )
+ if err != nil {
+ t.Fatalf("failed to create gRPC client: %v", err)
+ }
+
+ t.Cleanup(func() {
+ conn.Close()
+ service.Cleanup()
+ srv.Stop()
+ lis.Close()
+ })
+
+ cs := clientset.New(conn)
+ return cs.DeviceV1alpha1().GPUs()
+}
diff --git a/pkg/testutil/grpcserver_test.go b/pkg/testutil/grpcserver_test.go
new file mode 100644
index 000000000..460f3c489
--- /dev/null
+++ b/pkg/testutil/grpcserver_test.go
@@ -0,0 +1,57 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testutil
+
+import (
+ "testing"
+
+ pb "github.com/nvidia/nvsentinel/internal/generated/device/v1alpha1"
+)
+
+func TestNewTestGPUClient_CreateAndGet(t *testing.T) {
+ client := NewTestGPUClient(t)
+ ctx := t.Context()
+
+ const gpuName = "GPU-01234567-89ab-cdef-0123-456789abcdef"
+
+ created, err := client.CreateGpu(ctx, &pb.CreateGpuRequest{
+ Gpu: &pb.Gpu{
+ Metadata: &pb.ObjectMeta{
+ Name: gpuName,
+ Namespace: "default",
+ },
+ Spec: &pb.GpuSpec{
+ Uuid: "GPU-TEST-1",
+ },
+ },
+ })
+ if err != nil {
+ t.Fatalf("CreateGpu failed: %v", err)
+ }
+ if created.GetMetadata().GetName() != gpuName {
+ t.Errorf("expected name %q, got %q", gpuName, created.GetMetadata().GetName())
+ }
+
+ resp, err := client.GetGpu(ctx, &pb.GetGpuRequest{
+ Name: gpuName,
+ Namespace: "default",
+ })
+ if err != nil {
+ t.Fatalf("GetGpu failed: %v", err)
+ }
+ if resp.GetGpu().GetSpec().GetUuid() != "GPU-TEST-1" {
+ t.Errorf("expected UUID %q, got %q", "GPU-TEST-1", resp.GetGpu().GetSpec().GetUuid())
+ }
+}
diff --git a/pkg/util/net/uds.go b/pkg/util/net/uds.go
index 1083f4352..25072e73b 100644
--- a/pkg/util/net/uds.go
+++ b/pkg/util/net/uds.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -55,6 +55,9 @@ func CreateUDSListener(ctx context.Context, socketPath string, perm os.FileMode)
lc := net.ListenConfig{}
+ // Note: There is a residual TOCTOU window between CleanupUDS and Listen.
+ // This is acceptable because Listen will fail with EADDRINUSE if another
+ // process binds the socket in that window.
lis, err := lc.Listen(ctx, "unix", socketPath)
if err != nil {
return nil, nil, fmt.Errorf("failed to listen on unix socket %q: %w", socketPath, err)
diff --git a/pkg/util/verflag/verflag.go b/pkg/util/verflag/verflag.go
index 592a41f71..1dae5d3b9 100644
--- a/pkg/util/verflag/verflag.go
+++ b/pkg/util/verflag/verflag.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -21,7 +21,7 @@ import (
"strconv"
"text/tabwriter"
- "github.com/nvidia/nvsentinel/pkg/util/version"
+ "github.com/nvidia/nvsentinel/pkg/version"
"github.com/spf13/pflag"
)
@@ -111,7 +111,7 @@ func printVersionTable() {
fmt.Fprintf(w, "%s\n", programName)
fmt.Fprintf(w, "---\t---\n")
- fmt.Fprintf(w, "Version\t%s\n", v.GitVersion)
+ fmt.Fprintf(w, "Version\t%s\n", v.Version)
fmt.Fprintf(w, "GitCommit\t%s\n", v.GitCommit)
fmt.Fprintf(w, "BuildDate\t%s\n", v.BuildDate)
fmt.Fprintf(w, "GoVersion\t%s\n", v.GoVersion)
diff --git a/pkg/util/version/version.go b/pkg/util/version/version.go
deleted file mode 100644
index dac336d55..000000000
--- a/pkg/util/version/version.go
+++ /dev/null
@@ -1,94 +0,0 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package version
-
-import (
- "encoding/json"
- "fmt"
- "net/http"
- "runtime"
-
- utilversion "k8s.io/apimachinery/pkg/util/version"
- "k8s.io/component-base/compatibility"
-)
-
-var (
- GitVersion = "v0.0.0-devel"
- GitCommit = "unknown"
- BuildDate = "unknown"
-)
-
-type Info struct {
- GitVersion string
- GitCommit string
- BuildDate string
- GoVersion string
- Compiler string
- Platform string
-}
-
-func Get() Info {
- return Info{
- GitVersion: GitVersion,
- GitCommit: GitCommit,
- BuildDate: BuildDate,
- GoVersion: runtime.Version(),
- Compiler: runtime.Compiler,
- Platform: fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH),
- }
-}
-
-func (i Info) String() string {
- return i.GitVersion
-}
-
-// UserAgent returns the standard user agent string for clients.
-func UserAgent() string {
- return fmt.Sprintf("nvidia-device-api/%s (%s)", GitVersion, Get().Platform)
-}
-
-func RegisterComponent(registry compatibility.ComponentGlobalsRegistry) error {
- v, err := utilversion.ParseSemantic(GitVersion)
- if err != nil {
- v = utilversion.MustParseSemantic("v0.0.1")
- }
-
- binaryVersion := v
- emulationVersion := v
- minCompatibilityVersion := v
-
- effectiveVer := compatibility.NewEffectiveVersion(
- binaryVersion,
- false,
- emulationVersion,
- minCompatibilityVersion,
- )
-
- if err := registry.Register("nvidia-device-api", effectiveVer, nil); err != nil {
- return fmt.Errorf("failed to register component with compatibility registry: %w", err)
- }
-
- return nil
-}
-
-func Handler() http.Handler {
- return http.HandlerFunc(versionHandler)
-}
-
-func versionHandler(w http.ResponseWriter, r *http.Request) {
- w.Header().Set("Content-Type", "application/json")
- w.WriteHeader(http.StatusOK)
- _ = json.NewEncoder(w).Encode(Get())
-}
diff --git a/pkg/util/version/version_test.go b/pkg/util/version/version_test.go
deleted file mode 100644
index 3548c63d9..000000000
--- a/pkg/util/version/version_test.go
+++ /dev/null
@@ -1,86 +0,0 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package version
-
-import (
- "strings"
- "testing"
-
- "k8s.io/component-base/compatibility"
-)
-
-func TestGet(t *testing.T) {
- info := Get()
-
- if info.GitVersion != GitVersion {
- t.Errorf("expected GitVersion %s, got %s", GitVersion, info.GitVersion)
- }
-
- if info.GoVersion == "" || info.Platform == "" {
- t.Error("runtime info (GoVersion/Platform) should not be empty")
- }
-}
-
-func TestUserAgent(t *testing.T) {
- ua := UserAgent()
- expectedPrefix := "nvidia-device-api/" + GitVersion
-
- if !strings.HasPrefix(ua, expectedPrefix) {
- t.Errorf("UserAgent %s does not start with %s", ua, expectedPrefix)
- }
-}
-
-func TestRegisterComponent(t *testing.T) {
- tests := []struct {
- name string
- gitVersion string
- }{
- {
- name: "valid semver",
- gitVersion: "v1.2.3",
- },
- {
- name: "invalid semver uses fallback",
- gitVersion: "development-build",
- },
- }
-
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- oldVersion := GitVersion
- GitVersion = tt.gitVersion
- defer func() { GitVersion = oldVersion }()
-
- registry := compatibility.NewComponentGlobalsRegistry()
-
- defer func() {
- if r := recover(); r != nil {
- t.Errorf("RegisterComponent panicked for version %s: %v", tt.gitVersion, r)
- }
- }()
-
- RegisterComponent(registry)
-
- effective := registry.EffectiveVersionFor("nvidia-device-api")
- if effective == nil {
- t.Fatal("component was not registered in the registry")
- }
-
- if effective.BinaryVersion() == nil {
- t.Error("EffectiveVersion has nil BinaryVersion")
- }
- })
- }
-}
diff --git a/pkg/version/version.go b/pkg/version/version.go
new file mode 100644
index 000000000..f2f31aa6f
--- /dev/null
+++ b/pkg/version/version.go
@@ -0,0 +1,98 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package version provides version information for the Device API Server.
+// These values are set at build time via ldflags.
+package version
+
+import (
+ "encoding/json"
+ "fmt"
+ "net/http"
+ "runtime"
+)
+
+// Build information set at compile time via -ldflags.
+var (
+ // Version is the semantic version of the build.
+ Version = "dev"
+
+ // GitCommit is the git commit SHA at build time.
+ GitCommit = "unknown"
+
+ // GitTreeState indicates if the git tree was clean or dirty.
+ GitTreeState = "unknown"
+
+ // BuildDate is the date of the build in ISO 8601 format.
+ BuildDate = "unknown"
+)
+
+// Info contains version information.
+type Info struct {
+ Version string `json:"version"`
+ GitCommit string `json:"gitCommit"`
+ GitTreeState string `json:"gitTreeState"`
+ BuildDate string `json:"buildDate"`
+ GoVersion string `json:"goVersion"`
+ Compiler string `json:"compiler"`
+ Platform string `json:"platform"`
+}
+
+// Get returns the version information.
+func Get() Info {
+ return Info{
+ Version: Version,
+ GitCommit: GitCommit,
+ GitTreeState: GitTreeState,
+ BuildDate: BuildDate,
+ GoVersion: runtime.Version(),
+ Compiler: runtime.Compiler,
+ Platform: fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH),
+ }
+}
+
+// String returns version information as a human-readable string.
+func (i Info) String() string {
+ return fmt.Sprintf(
+ "Version: %s\nGit Commit: %s\nGit Tree State: %s\nBuild Date: %s\nGo Version: %s\nCompiler: %s\nPlatform: %s",
+ i.Version,
+ i.GitCommit,
+ i.GitTreeState,
+ i.BuildDate,
+ i.GoVersion,
+ i.Compiler,
+ i.Platform,
+ )
+}
+
+// Short returns a short version string.
+func (i Info) Short() string {
+ return fmt.Sprintf("%s (%s)", i.Version, i.GitCommit)
+}
+
+// UserAgent returns the standard user agent string for clients.
+func UserAgent() string {
+ return fmt.Sprintf("nvidia-device-api/%s (%s)", Version, Get().Platform)
+}
+
+// Handler returns an HTTP handler that responds with version information as JSON.
+func Handler() http.Handler {
+ return http.HandlerFunc(versionHandler)
+}
+
+func versionHandler(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "application/json")
+ w.WriteHeader(http.StatusOK)
+ _ = json.NewEncoder(w).Encode(Get())
+}
diff --git a/pkg/version/version_test.go b/pkg/version/version_test.go
new file mode 100644
index 000000000..78c66358e
--- /dev/null
+++ b/pkg/version/version_test.go
@@ -0,0 +1,68 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package version
+
+import (
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+)
+
+func TestGet(t *testing.T) {
+ info := Get()
+
+ if info.Version != Version {
+ t.Errorf("expected Version %s, got %s", Version, info.Version)
+ }
+
+ if info.GoVersion == "" || info.Platform == "" {
+ t.Error("runtime info (GoVersion/Platform) should not be empty")
+ }
+}
+
+func TestUserAgent(t *testing.T) {
+ ua := UserAgent()
+ expectedPrefix := "nvidia-device-api/" + Version
+
+ if !strings.HasPrefix(ua, expectedPrefix) {
+ t.Errorf("UserAgent %s does not start with %s", ua, expectedPrefix)
+ }
+}
+
+func TestHandler(t *testing.T) {
+ req := httptest.NewRequest(http.MethodGet, "/version", nil)
+ w := httptest.NewRecorder()
+
+ Handler().ServeHTTP(w, req)
+
+ if w.Code != http.StatusOK {
+ t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
+ }
+
+ if ct := w.Header().Get("Content-Type"); ct != "application/json" {
+ t.Errorf("expected Content-Type application/json, got %s", ct)
+ }
+
+ var info Info
+ if err := json.NewDecoder(w.Body).Decode(&info); err != nil {
+ t.Fatalf("failed to decode response body: %v", err)
+ }
+
+ if info.Version != Version {
+ t.Errorf("expected version %s in response, got %s", Version, info.Version)
+ }
+}
diff --git a/test/integration/client-go/device/v1alpha1/clientset_test.go b/test/integration/client-go/device/v1alpha1/clientset_test.go
deleted file mode 100644
index 6745e3003..000000000
--- a/test/integration/client-go/device/v1alpha1/clientset_test.go
+++ /dev/null
@@ -1,196 +0,0 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package v1alpha1_test
-
-import (
- "context"
- "encoding/json"
- "fmt"
- "strconv"
- "testing"
- "time"
-
- devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1"
- "github.com/nvidia/nvsentinel/cmd/device-apiserver/app"
- "github.com/nvidia/nvsentinel/cmd/device-apiserver/app/options"
- "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned"
- "github.com/nvidia/nvsentinel/pkg/grpc/client"
- "github.com/nvidia/nvsentinel/pkg/util/testutils"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-)
-
-func TestEndToEnd(t *testing.T) {
- ctx, cancel := context.WithCancel(context.Background())
- defer cancel()
-
- tmpDir := t.TempDir()
-
- socketPath := testutils.NewUnixAddr(t)
- kineSocket := fmt.Sprintf("unix://%s", testutils.NewUnixAddr(t))
- healthAddr := testutils.GetFreeTCPAddress(t)
-
- opts := options.NewServerRunOptions()
- opts.NodeName = "test-node"
- opts.GRPC.BindAddress = "unix://" + socketPath
- opts.HealthAddress = healthAddr
- opts.Storage.DatabaseDir = tmpDir
- opts.Storage.DatabasePath = tmpDir + "state.db"
- opts.Storage.KineSocketPath = kineSocket
- opts.Storage.KineConfig.Endpoint = fmt.Sprintf("sqlite://%s/db.sqlite", tmpDir)
- opts.Storage.KineConfig.Listener = kineSocket
-
- completed, err := opts.Complete(ctx)
- if err != nil {
- t.Fatalf("Failed to complete options: %v", err)
- }
-
- go func() {
- if err := app.Run(ctx, completed); err != nil && err != context.Canceled {
- t.Errorf("Server exited with error: %v", err)
- }
- }()
-
- testutils.WaitForStatus(t, healthAddr, "", 5*time.Second, testutils.IsServing)
-
- config := &client.Config{Target: "unix://" + socketPath}
- client, err := versioned.NewForConfig(config)
- if err != nil {
- t.Fatalf("Failed to create clientset: %v", err)
- }
-
- var created *devicev1alpha1.GPU
-
- t.Run("Create", func(t *testing.T) {
- gpu := &devicev1alpha1.GPU{
- ObjectMeta: metav1.ObjectMeta{
- Name: "gpu-ad2367dd-a40e-6b86-6fc3-c44a2cc92c7e",
- },
- Spec: devicev1alpha1.GPUSpec{
- UUID: "GPU-ad2367dd-a40e-6b86-6fc3-c44a2cc92c7e",
- },
- Status: devicev1alpha1.GPUStatus{
- Conditions: []metav1.Condition{
- {
- Type: "Ready",
- Status: metav1.ConditionFalse,
- Reason: "DriverNotReaady",
- Message: "Driver is posting ready status",
- },
- },
- },
- }
-
- created, err = client.DeviceV1alpha1().GPUs().Create(ctx, gpu, metav1.CreateOptions{})
- if err != nil {
- t.Fatalf("Failed to create GPU: %v", err)
- }
-
- // Client generated fields
- if created.Kind != "GPU" {
- t.Errorf("expected kind 'GPU', got %s", created.Kind)
- }
- if created.APIVersion != devicev1alpha1.SchemeGroupVersion.String() {
- t.Errorf("expected version %s, got %s", devicev1alpha1.SchemeGroupVersion.String(), created.APIVersion)
- }
-
- // Server generated fields
- if created.Namespace != "default" {
- t.Error("Server failed to set default namespace")
- }
- if created.UID == "" {
- t.Error("Server failed to generate a UID for the GPU")
- }
- if created.ResourceVersion == "" {
- t.Error("Server failed to generate a ResourceVersion")
- }
- if created.Generation != 1 {
- t.Error("Server failed to set initial Generation")
- }
- if created.CreationTimestamp.IsZero() {
- t.Error("Server failed to set a CreationTimestamp")
- }
-
- // Data integrity
- if created.Name != gpu.Name {
- t.Errorf("expected name %q, got %q", gpu.Name, created.Name)
- }
- if created.Spec.UUID != gpu.Spec.UUID {
- t.Errorf("expected UUID %q, got %q", gpu.Spec.UUID, created.Spec.UUID)
- }
-
- // Data integrity: Status
- if len(created.Status.Conditions) != len(gpu.Status.Conditions) {
- t.Fatalf("expected %d conditions, got %d", len(gpu.Status.Conditions), len(created.Status.Conditions))
- }
-
- cond := created.Status.Conditions[0]
- expected := gpu.Status.Conditions[0]
-
- if cond.Type != expected.Type {
- t.Errorf("expected condition Type %q, got %q", expected.Type, cond.Type)
- }
- if cond.Status != expected.Status {
- t.Errorf("expected condition Status %q, got %q", expected.Status, cond.Status)
- }
- if cond.Reason != expected.Reason {
- t.Errorf("expected condition Reason %q, got %q", expected.Reason, cond.Reason)
- }
- if cond.Message != expected.Message {
- t.Errorf("expected condition Message %q, got %q", expected.Message, cond.Message)
- }
- if cond.LastTransitionTime.IsZero() {
- t.Error("condition LastTransitionTime should not be zero")
- }
-
- // TODO: remove
- objJson, _ := json.MarshalIndent(created, "", " ")
- fmt.Printf("\n--- [Object After Creation] ---\n%s\n", string(objJson))
- })
-
- t.Run("Update", func(t *testing.T) {
- if created == nil {
- t.Skip("Skipping: Create failed")
- }
-
- toUpdate := created.DeepCopy()
- toUpdate.Spec.UUID = "GPU-cd2367dd-a40e-6b86-6fc3-c44a2cc92c7d"
-
- updated, err := client.DeviceV1alpha1().GPUs().Update(ctx, toUpdate, metav1.UpdateOptions{})
- if err != nil {
- t.Fatalf("Failed to update GPU: %v", err)
- }
-
- if updated.Spec.UUID != toUpdate.Spec.UUID {
- t.Errorf("expected UUID %q, got %q", toUpdate.Spec.UUID, updated.Spec.UUID)
- }
-
- oldRV, _ := strconv.ParseInt(created.ResourceVersion, 10, 64)
- updatedRV, _ := strconv.ParseInt(updated.ResourceVersion, 10, 64)
-
- if updatedRV <= oldRV {
- t.Errorf("expected ResourceVersion to increase, got %d (old) and %d (new)", oldRV, updatedRV)
- }
-
- if updated.Generation <= created.Generation {
- t.Errorf("expected Generation to increase, got %d (old) and %d (new)", created.Generation, updated.Generation)
- }
-
- // TODO: remove
- objJson, _ := json.MarshalIndent(updated, "", " ")
- fmt.Printf("\n--- [Object After Update] ---\n%s\n", string(objJson))
- })
-
- // TODO: add tests for Delete, List, Watch
-}