diff --git a/.github/headers/LICENSE b/.github/headers/LICENSE index a3f12d28d..7760ae7c6 100644 --- a/.github/headers/LICENSE +++ b/.github/headers/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/.gitignore b/.gitignore index d19ccad4e..0c9fbd23a 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,13 @@ code-quality-report.json go.work go.work.sum +# Local tool binaries (managed by api/Makefile) +api/bin/* + +# Server binary output +bin/ +/device-api-server + # ============================================================================== # IDE & Editor Configurations # ============================================================================== @@ -48,3 +55,9 @@ go.work.sum # Emacs *~ \#*\# + + +# ============================================================================== +# Git Worktrees +# ============================================================================== +.worktrees/ diff --git a/.versions.yaml b/.versions.yaml index 122a33f86..15a409121 100644 --- a/.versions.yaml +++ b/.versions.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -34,6 +34,6 @@ go_tools: # Protocol Buffers / gRPC protobuf: - protobuf: 'v33.0' + protobuf: 'v33.4' protoc_gen_go: 'v1.36.10' protoc_gen_go_grpc: 'v1.5.1' diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 9dbdcf56a..e7e16ac6f 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -1,18 +1,130 @@ -# Development Guide +# NVIDIA Device API: Development Guide + +This guide covers the development setup and workflows for contributing to the NVIDIA Device API. + +## Module Structure + +This repository is a multi-module monorepo containing multiple Go modules: + +| Module | Path | Description | +|--------|------|-------------| +| `github.com/nvidia/nvsentinel` | `/` | Device API Server implementation | +| `github.com/nvidia/nvsentinel/api` | `/api` | API definitions (protobuf and Go types) | +| `github.com/nvidia/nvsentinel/client-go` | `/client-go` | Kubernetes-style gRPC clients | +| `github.com/nvidia/nvsentinel/code-generator` | `/code-generator` | Code generation tools | + +The API module is designed to be imported independently by consumers who only need the type definitions. + +## Architecture + +This project bridges **gRPC** (for node-local performance) with **Kubernetes API Machinery** (for developer experience). + +1. **Definitions**: `api/proto` (Wire format) and `api/device` (Go types). +2. **Conversion**: `api/device/${version}/converter.go` maps gRPC messages to K8s-style structs. +3. **Generation**: A pipeline driven by `code-generator/kube_codegen.sh`, which utilizes a modified `client-gen` to produce gRPC-backed Kubernetes clients in the `client-go` module. + +--- + +## Code Generation Pipeline + +The NVIDIA Device API uses a multi-stage pipeline to bridge gRPC with Kubernetes API machinery. For module-specific details, see the [client-go Development Guide](./client-go/DEVELOPMENT.md). + +```mermaid +graph TD + API["API Definitions
(nvidia/nvsentinel/api)"] -->|Input| CG(client-gen
*Custom Build*) + API -->|Input| LG(lister-gen) + + CG -->|Generates| CLIENT[client/versioned] + LG -->|Generates| LISTERS[listers/] + + CLIENT & LISTERS -->|Input| IG(informer-gen) + IG -->|Generates| INFORMERS[informers/] + + CLIENT & LISTERS & INFORMERS -->|Final Output| SDK[Ready-to-use SDK] +``` + +### Build Sequence + +When you run `make code-gen` from the root, the following sequence is executed: + +1. **Protoc**: Compiles `.proto` into Go gRPC stubs in `api/gen/`. +2. **DeepCopy**: Generates `runtime.Object` methods required for K8s compatibility. +3. **Goverter**: Generates type conversion logic between Protobuf and Go structs. +4. **Custom client-gen**: Orchestrated by `code-generator/kube_codegen.sh` to produce the versioned Clientset, Informers, and Listers in `client-go/`. + +--- + +## Development Workflow + +1. **Modify**: Edit the Protobuf definitions in `api/proto` or Go types in `api/device`. +2. **Update**: Update the conversion logic in `api/device/${version}/converter.go` to handle changes, if necessary. +3. **Generate**: Run `make code-gen` from the root. This updates the gRPC stubs, helper methods, and the `client-go` SDK. +4. **Verify**: Run `make verify-codegen` to ensure the workspace is consistent. +5. **Test**: Add tests to the affected module and run `make test` from the root. + +> [!NOTE] Use the fake clients in `client-go/client/versioned/fake` for testing controllers without a real gRPC server. + +--- + +## Code Standards & Compliance + +### Commit Messages & Signing (DCO) + +We follow the [Conventional Commits](https://www.conventionalcommits.org) specification. Additionally, all commits **must** be signed off to comply with the Developer Certificate of Origin (DCO). + +```bash +# Example: feat, fix, docs, chore, refactor +git commit -s -m "feat: add new GPU condition type" +``` + +### License Headers + +Every source file (.go, .proto, .sh, Makefile) must include the Apache 2.0 license header. 
+ +- **Go/Proto Template**: See `api/hack/boilerplate.go.txt`. +- **Year**: Ensure the copyright year is current. --- -## Code Generation +## Troubleshooting -This project relies heavily on generated code to ensure consistency with the Kubernetes API machinery. +### Tooling Not Found + +We use `.versions.yaml` to pin tool versions. Our Makefile attempts to use tools from your system path or download them to your Go bin directory. + +- **Verify Installation**: `which protoc` or `which yq`. +- **Fix**: Ensure your `GOPATH/bin` is in your system `$PATH`: + ```bash + export PATH=$PATH:$(go env GOPATH)/bin + ``` + +### Generated Code Out of Sync + +If the build fails or `make verify-codegen` returns an error, your generated artifacts are likely stale. + +```bash +# Clean all generated files across the monorepo +make clean + +# Re-run the full pipeline +make code-gen +``` + +### Dependency Issues + +If you see "module not found" or checksum errors: + +```bash +# Tidy all modules +make tidy +``` + +--- -### Generation Pipeline -The `make code-gen` command orchestrates several tools: +## Getting Help -1. **Protoc**: Generates gRPC Go bindings from `api/proto`. -2. **Goverter**: Generates type-safe conversion logic between internal gRPC types and the Kubernetes-style API types defined in `api/device/`. -3. **K8s Code-Gen**: - - Generates `DeepCopy` methods for API types to support standard Kubernetes object manipulation. - - Generates a versioned, typed **clientset**, along with **listers** and **informers**, providing a native `client-go` experience for consumers. +- **Issues**: [Create an issue](https://github.com/NVIDIA/device-api/issues/new) +- **Questions**: [Start a discussion](https://github.com/NVIDIA/device-api/discussions) +- **Security**: Please refer to [SECURITY](SECURITY.md) for reporting vulnerabilities. 
--- diff --git a/Makefile b/Makefile index 79e7c5567..1dba8bcf7 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,14 +21,28 @@ SHELL = /usr/bin/env bash -o pipefail .SHELLFLAGS = -ec -VERSION_PKG = github.com/nvidia/nvsentinel/pkg/util/version -GIT_VERSION := $(shell git describe --tags --always --dirty) -GIT_COMMIT := $(shell git rev-parse HEAD) -BUILD_DATE := $(shell date -u +'%Y-%m-%dT%H:%M:%SZ') - -LDFLAGS := -X $(VERSION_PKG).GitVersion=$(GIT_VERSION) \ - -X $(VERSION_PKG).GitCommit=$(GIT_COMMIT) \ - -X $(VERSION_PKG).BuildDate=$(BUILD_DATE) +# Go build settings +GOOS ?= $(shell go env GOOS) +GOARCH ?= $(shell go env GOARCH) +VERSION ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo "dev") +GIT_COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo "unknown") +GIT_TREE_STATE ?= $(shell if git diff --quiet 2>/dev/null; then echo "clean"; else echo "dirty"; fi) +BUILD_DATE ?= $(shell date -u +"%Y-%m-%dT%H:%M:%SZ") + +# Version package path for ldflags +VERSION_PKG = github.com/nvidia/nvsentinel/pkg/version + +# Container settings +CONTAINER_RUNTIME ?= docker +IMAGE_REGISTRY ?= ghcr.io/nvidia/nvsentinel +DOCKERFILE := deployments/container/Dockerfile + +# Linker flags +LDFLAGS = -s -w \ + -X $(VERSION_PKG).Version=$(VERSION) \ + -X $(VERSION_PKG).GitCommit=$(GIT_COMMIT) \ + -X $(VERSION_PKG).GitTreeState=$(GIT_TREE_STATE) \ + -X $(VERSION_PKG).BuildDate=$(BUILD_DATE) # ============================================================================== # Targets @@ -59,34 +73,134 @@ verify-codegen: code-gen ## Verify generated code is up-to-date. 
exit 1; \ fi -.PHONY: tidy -tidy: ## Run go mod tidy - go mod tidy - -##@ Build & Test +##@ Build .PHONY: build -build: ## Build the device-apiserver binary. - go build -ldflags "$(LDFLAGS)" -o bin/device-apiserver ./cmd/device-apiserver +build: build-modules build-server ## Build all modules and server. + +.PHONY: build-modules +build-modules: ## Build all modules. + @for mod in $(MODULES); do \ + if [ -f $$mod/Makefile ]; then \ + $(MAKE) -C $$mod build; \ + fi \ + done + +.PHONY: build-server +build-server: ## Build the Device API Server + @echo "Building device-api-server..." + @mkdir -p bin + CGO_ENABLED=0 GOOS=$(GOOS) GOARCH=$(GOARCH) go build \ + -ldflags "$(LDFLAGS)" \ + -o bin/device-api-server \ + ./cmd/device-api-server + @echo "Built bin/device-api-server" + +.PHONY: build-nvml-provider +build-nvml-provider: ## Build the NVML Provider sidecar (requires CGO) + @echo "Building nvml-provider..." + @mkdir -p bin + CGO_ENABLED=1 GOOS=$(GOOS) GOARCH=$(GOARCH) go build \ + -tags=nvml \ + -ldflags "$(LDFLAGS)" \ + -o bin/nvml-provider \ + ./cmd/nvml-provider + @echo "Built bin/nvml-provider" + +##@ Testing .PHONY: test -test: ## Run unit tests. - GOTOOLCHAIN=go1.25.5+auto go test -v $$(go list ./... | grep -vE '/pkg/client-go/(client|informers|listers)|/internal/generated/|/test/integration/|/examples/') -cover cover.out +test: test-modules test-server ## Run tests in all modules. + +.PHONY: test-modules +test-modules: ## Run tests in all modules. + @for mod in $(MODULES); do \ + if [ -f $$mod/Makefile ]; then \ + $(MAKE) -C $$mod test; \ + fi \ + done + +.PHONY: test-server +test-server: ## Run server tests only + go test -race -v ./pkg/... .PHONY: test-integration -test-integration: ## Run integration tests. +test-integration: ## Run integration tests go test -v ./test/integration/... +##@ Linting + .PHONY: lint -lint: ## Run golangci-lint. - golangci-lint run ./... +lint: ## Run linting on all modules. 
+ @for mod in $(MODULES); do \ + if [ -f $$mod/Makefile ]; then \ + $(MAKE) -C $$mod lint; \ + fi \ + done + go vet ./... + +##@ Container Images + +.PHONY: docker-build +docker-build: docker-build-server docker-build-nvml-provider ## Build all container images + +.PHONY: docker-build-server +docker-build-server: ## Build device-api-server container image + $(CONTAINER_RUNTIME) build \ + --target device-api-server \ + --build-arg VERSION=$(VERSION) \ + --build-arg GIT_COMMIT=$(GIT_COMMIT) \ + --build-arg GIT_TREE_STATE=$(GIT_TREE_STATE) \ + --build-arg BUILD_DATE=$(BUILD_DATE) \ + -t $(IMAGE_REGISTRY)/device-api-server:$(VERSION) \ + -f $(DOCKERFILE) . + +.PHONY: docker-build-nvml-provider +docker-build-nvml-provider: ## Build nvml-provider container image + $(CONTAINER_RUNTIME) build \ + --target nvml-provider \ + --build-arg VERSION=$(VERSION) \ + --build-arg GIT_COMMIT=$(GIT_COMMIT) \ + --build-arg GIT_TREE_STATE=$(GIT_TREE_STATE) \ + --build-arg BUILD_DATE=$(BUILD_DATE) \ + -t $(IMAGE_REGISTRY)/nvml-provider:$(VERSION) \ + -f $(DOCKERFILE) . + +.PHONY: docker-push +docker-push: ## Push all container images + $(CONTAINER_RUNTIME) push $(IMAGE_REGISTRY)/device-api-server:$(VERSION) + $(CONTAINER_RUNTIME) push $(IMAGE_REGISTRY)/nvml-provider:$(VERSION) + +##@ Helm + +.PHONY: helm-lint +helm-lint: ## Lint Helm chart + helm lint deployments/helm/device-api-server + +.PHONY: helm-template +helm-template: ## Render Helm chart templates + helm template device-api-server deployments/helm/device-api-server + +.PHONY: helm-package +helm-package: ## Package Helm chart + @mkdir -p dist/ + helm package deployments/helm/device-api-server -d dist/ + +##@ Cleanup .PHONY: clean -clean: ## Remove generated artifacts. - @echo "Cleaning generated artifacts..." +clean: ## Clean generated artifacts in all modules. 
+ @for mod in $(MODULES); do \ + if [ -f $$mod/Makefile ]; then \ + $(MAKE) -C $$mod clean; \ + fi \ + done rm -rf bin/ - rm -rf internal/generated/ - rm -rf pkg/client-go/client/ pkg/client-go/informers/ pkg/client-go/listers/ - find api/ -name "zz_generated.deepcopy.go" -delete - find api/ -name "zz_generated.goverter.go" -delete - rm -f cover.out + +.PHONY: tidy +tidy: ## Run go mod tidy on all modules. + @for mod in $(MODULES); do \ + echo "Tidying $$mod..."; \ + (cd $$mod && go mod tidy); \ + done + go mod tidy diff --git a/README.md b/README.md index b7bbfc818..fcaf95767 100644 --- a/README.md +++ b/README.md @@ -1,56 +1,169 @@ # NVIDIA Device API -**The NVIDIA Device API allows you to query and manipulate the state of node-local resources (such as GPUs) in Kubernetes**. Unlike the cluster-wide Kubernetes API, the Device API operates exclusively at the node level. +The NVIDIA Device API provides a Kubernetes-idiomatic Go SDK and Protobuf definitions for interacting with NVIDIA device resources. -The core control plane is the Device API server and the gRPC API that it exposes. Node-level agents, local monitoring tools, and external components communicate with one another through this node-local Device API server rather than the central Kubernetes control plane. +**Node-local GPU device state management for Kubernetes** -NVIDIA provides a [client library](./pkg/client-go) for those looking to write applications using the Device API. This library allows you to query and manipulate node-local resources using standard Kubernetes interfaces. Alternatively, the API can be accessed directly via gRPC. +The NVIDIA Device API provides a standardized gRPC interface for observing and managing GPU device states in Kubernetes environments. 
It enables coordination between: + +- **Providers** (health monitors like NVSentinel, DCGM) that detect GPU health issues +- **Consumers** (device plugins, DRA drivers) that need GPU health status for scheduling + +## Overview + +The Device API Server is a pure Go gRPC server with no hardware dependencies. +GPU enumeration and health monitoring is provided by external providers (sidecars). + +``` +┌─────────────────────────────────────────────────────────────┐ +│ GPU Node │ +│ │ +│ ┌─────────────────────────────────────────────────────────┐│ +│ │ Device API Server (DaemonSet) ││ +│ │ ││ +│ │ ┌─────────────────────────────────────────────────┐ ││ +│ │ │ GpuService (unified) │ ││ +│ │ │ Read: GetGpu, ListGpus, WatchGpus │ ││ +│ │ │ Write: CreateGpu, UpdateGpuStatus, DeleteGpu │ ││ +│ │ └────────────────────┬────────────────────────────┘ ││ +│ │ ▼ ││ +│ │ ┌──────────────────────────────────────────────────┐ ││ +│ │ │ GPU Cache (RWMutex) │ ││ +│ │ └──────────────────────────────────────────────────┘ ││ +│ └─────────────────────────────────────────────────────────┘│ +│ │ +│ Providers (gRPC clients): │ +│ ├── nvml-provider sidecar ─► CreateGpu, UpdateGpuStatus │ +│ ├── NVSentinel ────────────► CreateGpu, UpdateGpuStatus │ +│ └── Custom providers ──────► CreateGpu, UpdateGpuStatus │ +│ │ +│ Consumers (gRPC clients): │ +│ ├── Device Plugins ────────► GetGpu, ListGpus, WatchGpus │ +│ └── DRA Drivers ───────────► GetGpu, ListGpus, WatchGpus │ +└─────────────────────────────────────────────────────────────┘ +``` + +## Key Features + +- **Pure Go server**: No hardware dependencies; providers run as separate sidecars +- **Read-blocking semantics**: Consumer reads block during provider updates to prevent stale data +- **Multiple provider support**: Aggregate health status from NVSentinel, DCGM, or custom providers +- **Watch streams**: Real-time GPU state change notifications +- **Prometheus metrics**: Full observability with alerting rules +- **Helm chart**: Production-ready 
Kubernetes deployment + +## Repository Structure + +| Module | Description | +| :--- | :--- | +| [`api/`](./api) | Protobuf definitions and Go types for the Device API. | +| [`client-go/`](./client-go) | Kubernetes-style generated clients, informers, and listers. | +| [`code-generator/`](./code-generator) | Tools for generating NVIDIA-specific client logic. | +| [`cmd/device-api-server/`](./cmd/device-api-server) | Device API Server binary | +| [`pkg/deviceapiserver/`](./pkg/deviceapiserver) | Server implementation | +| [`charts/`](./charts) | Helm chart for Kubernetes deployment | --- ## Quick Start +### Deploy Device API Server + +```bash +# Install with Helm +helm install device-api-server ./deployments/helm/device-api-server \ + --namespace device-api --create-namespace +``` + +For GPU enumeration and health monitoring, deploy the nvml-provider sidecar. +See the [nvml-sidecar demo](demos/nvml-sidecar-demo.sh) for an example deployment. + +### Using the Go Client + +```bash +go get github.com/nvidia/device-api/api@latest +``` + ```go import ( - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/nvidia/nvsentinel/pkg/client-go/clientset/versioned" - "github.com/nvidia/nvsentinel/pkg/grpc/client" + v1alpha1 "github.com/nvidia/device-api/api/gen/go/device/v1alpha1" +) +``` + +### Example: List GPUs + +```go +package main + +import ( + "context" + "log" + + v1alpha1 "github.com/nvidia/device-api/api/gen/go/device/v1alpha1" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" ) func main() { - ctx := context.Background() + // Connect via Unix socket (recommended for node-local access) + conn, err := grpc.NewClient( + "unix:///var/run/device-api/device.sock", + grpc.WithTransportCredentials(insecure.NewCredentials()), + ) + if err != nil { + log.Fatalf("failed to connect: %v", err) + } + defer conn.Close() - // Connect to the local node's Device API server - config := &client.Config{Target: 
"unix:///var/run/nvidia-device-api/device-api.sock"} - clientset := versioned.NewForConfigOrDie(config) + client := v1alpha1.NewGpuServiceClient(conn) - // Standard Kubernetes-style List call - gpus, err := clientset.DeviceV1alpha1().GPUs().List(ctx, metav1.ListOptions{}) + // List all GPUs + resp, err := client.ListGpus(context.Background(), &v1alpha1.ListGpusRequest{}) if err != nil { - panic(err) + log.Fatalf("failed to list GPUs: %v", err) + } + + for _, gpu := range resp.GpuList.Items { + log.Printf("GPU: %s (UUID: %s)", gpu.Name, gpu.Spec.Uuid) + for _, cond := range gpu.Status.Conditions { + log.Printf(" %s: %s (%s)", cond.Type, cond.Status, cond.Reason) + } } } ``` -See [examples](./examples) for additional details. +### Using grpcurl ---- +```bash +# List GPUs +grpcurl -plaintext localhost:50051 nvidia.device.v1alpha1.GpuService/ListGpus + +# Watch for changes +grpcurl -plaintext localhost:50051 nvidia.device.v1alpha1.GpuService/WatchGpus +``` -## Components +## API Overview -### Device API Server -The `device-apiserver` is a node-local control plane for NVIDIA devices. 
+### GpuService -**Running the server**: -```bash -# Build the binary -make build +The unified `GpuService` follows Kubernetes API conventions with standard CRUD methods: -# Start the server with a local database -./bin/device-apiserver \ - --bind-address="unix:///var/run/nvidia-device-api/device-api.sock" \ - --datastore-endpoint="sqlite:///var/lib/nvidia-device-api/state.db" -``` +**Read Operations** (for consumers like device plugins and DRA drivers): + +| Method | Description | +|--------|-------------| +| `GetGpu` | Retrieves a single GPU resource by its unique name | +| `ListGpus` | Retrieves a list of all GPU resources | +| `WatchGpus` | Streams lifecycle events (ADDED, MODIFIED, DELETED) for GPU resources | + +**Write Operations** (for providers like health monitors): + +| Method | Description | +|--------|-------------| +| `CreateGpu` | Register a new GPU with the server | +| `UpdateGpu` | Replace entire GPU resource | +| `UpdateGpuStatus` | Update GPU status only (acquires write lock) | +| `DeleteGpu` | Remove a GPU from the server | --- @@ -58,29 +171,60 @@ make build ### Prerequisites -* **Go**: `v1.25+` -* **Protoc**: Required for protobuf generation. -* **Make** +- **Go**: `v1.25+` +- **Protoc**: Required for protobuf generation +- **golangci-lint**: Required for code quality checks +- **Make**: Used for orchestrating build and generation tasks +- **Helm 3.0+**: For chart development -### Workflow -The project utilizes a unified generation pipeline. **Avoid editing generated files directly**. 
If Protobuf definitions (`.proto`) or Go types (`_types.go`) are modified, run the following commands to synchronize the repository: +### Build ```bash -# Sync all gRPC bindings, DeepCopy/Conversion methods, Clients, and Server +# Build everything +make build + +# Build server only +make build-server + +# Generate protobuf code make code-gen +``` -# Run tests +### Test + +```bash +# Run all tests make test -# Verify code quality -make lint +# Run server tests only +make test-server +``` -# Optional: Run integration tests -make test-integration +### Lint + +```bash +make lint ``` --- +## Documentation + +- **[API Reference](docs/api/device-api-server.md)** - Complete gRPC API documentation +- **[Operations Guide](docs/operations/device-api-server.md)** - Deployment, configuration, monitoring +- **[Helm Chart](deployments/helm/device-api-server/README.md)** - Chart configuration reference +- **[Design Documents](docs/design/)** - Architecture and design decisions + +The `client-go` module includes several examples for how to use the generated clients: + +* **Standard Client**: Basic CRUD operations. +* **Shared Informers**: High-performance caching for controllers. +* **Watch**: Real-time event streaming via gRPC. + +See the [examples](./client-go/examples) directory for details. + +--- + ## Contributing We welcome contributions! Please see: diff --git a/api/device/v1alpha1/converter.go b/api/device/v1alpha1/converter.go index ff649f992..14b11b5e0 100644 --- a/api/device/v1alpha1/converter.go +++ b/api/device/v1alpha1/converter.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -54,6 +54,17 @@ type Converter interface { // FromProtobufObjectMeta converts a protobuf ObjectMeta into a metav1.ObjectMeta object. 
// + // The following fields are intentionally excluded from the proto API: + // - DeletionTimestamp/GracePeriodSeconds: Managed by server-side deletion logic + // - Labels/Annotations: Not needed for device-level proto API; K8s controllers + // should use the native K8s API for label/annotation management + // - OwnerReferences/Finalizers: Not exposed in proto to prevent external + // controllers from creating dependency chains via the device API + // - ManagedFields/SelfLink: Server-managed metadata, not user-facing + // + // If labels/annotations support is needed in the future, add them to the + // proto ObjectMeta definition and remove the goverter:ignore directives. + // // goverter:map Uid UID // goverter:ignore GenerateName DeletionTimestamp DeletionGracePeriodSeconds // goverter:ignore Labels Annotations OwnerReferences Finalizers ManagedFields SelfLink diff --git a/api/device/v1alpha1/gpu_types.go b/api/device/v1alpha1/gpu_types.go index e551b85a9..704bea40e 100644 --- a/api/device/v1alpha1/gpu_types.go +++ b/api/device/v1alpha1/gpu_types.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -48,8 +48,7 @@ type GPUStatus struct { // // +genclient // +genclient:nonNamespaced -// +genclient:onlyVerbs=get,list,watch,create,update,delete -// +genclient:noStatus +// +genclient:onlyVerbs=get,list,watch,create,update,updateStatus,delete // +k8s:deepcopy-gen=true // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object type GPU struct { diff --git a/api/device/v1alpha1/zz_generated.deepcopy.go b/api/device/v1alpha1/zz_generated.deepcopy.go index 0c399eb3e..f5cf44cb4 100644 --- a/api/device/v1alpha1/zz_generated.deepcopy.go +++ b/api/device/v1alpha1/zz_generated.deepcopy.go @@ -1,7 +1,7 @@ //go:build !ignore_autogenerated // +build !ignore_autogenerated -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/api/proto/device/v1alpha1/gpu.proto b/api/proto/device/v1alpha1/gpu.proto index 2641c415e..88577a9c6 100644 --- a/api/proto/device/v1alpha1/gpu.proto +++ b/api/proto/device/v1alpha1/gpu.proto @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -194,6 +194,9 @@ service GpuService { // UpdateGpu updates a single GPU resource. rpc UpdateGpu(UpdateGpuRequest) returns (Gpu); + // UpdateGpuStatus updates only the status subresource of a GPU. + rpc UpdateGpuStatus(UpdateGpuStatusRequest) returns (Gpu); + // DeleteGpu deletes a single GPU resource. rpc DeleteGpu(DeleteGpuRequest) returns (google.protobuf.Empty); } @@ -289,6 +292,18 @@ message UpdateGpuRequest { UpdateOptions opts = 2; } +// UpdateGpuStatusRequest specifies the GPU whose status should be updated. 
+// Only metadata (name, namespace, resource_version) and status fields are used. +message UpdateGpuStatusRequest { + // gpu is the GPU resource with updated status. + // The server reads metadata.name, metadata.namespace, metadata.resource_version + // and status from this object. All other fields are ignored. + Gpu gpu = 1; + + // opts contains the options for the update. + UpdateOptions opts = 2; +} + message DeleteGpuRequest { // The unique resource name of the GPU to delete. string name = 1; diff --git a/cmd/device-api-server/main.go b/cmd/device-api-server/main.go new file mode 100644 index 000000000..91f61b039 --- /dev/null +++ b/cmd/device-api-server/main.go @@ -0,0 +1,186 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package main implements the Device API Server. +// +// The Device API Server is a node-local gRPC cache server deployed as a +// Kubernetes DaemonSet. It acts as an intermediary between providers +// (health monitors) that update GPU device states and consumers +// (device plugins, DRA drivers) that read device states. 
+// +// Key features: +// - Read-blocking semantics: Reads are blocked during provider updates +// to prevent consumers from reading stale data +// - Multiple provider support: Multiple health monitors can update +// different conditions on the same GPUs +// - Multiple consumer support: Device plugins, DRA drivers, and other +// consumers can read and watch GPU states +// - Observability: Prometheus metrics, structured logging with klog/v2 +package main + +import ( + "context" + "encoding/json" + "fmt" + "os" + "os/signal" + "syscall" + + "github.com/spf13/pflag" + "golang.org/x/sync/errgroup" + cliflag "k8s.io/component-base/cli/flag" + "k8s.io/klog/v2" + + "github.com/nvidia/nvsentinel/pkg/controlplane/apiserver" + "github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/options" + "github.com/nvidia/nvsentinel/pkg/storage/storagebackend" + "github.com/nvidia/nvsentinel/pkg/version" + + // Import service providers so their init() functions register them. + _ "github.com/nvidia/nvsentinel/pkg/services/device/v1alpha1" +) + +const ( + // ComponentName is the name of this component for logging. + ComponentName = "device-api-server" +) + +func main() { + opts := options.NewOptions() + + fss := cliflag.NamedFlagSets{} + opts.AddFlags(&fss) + + // Add a version flag to the global flag set. + showVersion := pflag.Bool("version", false, "Show version and exit") + + // Merge all named flag sets into the global pflag command line. + for _, fs := range fss.FlagSets { + pflag.CommandLine.AddFlagSet(fs) + } + + pflag.Parse() + + // Handle version flag before any other initialization. + if *showVersion { + v := version.Get() + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + if err := enc.Encode(v); err != nil { + fmt.Fprintf(os.Stderr, "Failed to encode version: %v\n", err) + os.Exit(1) + } + os.Exit(0) + } + + // Set up signal handling for graceful shutdown. 
+ ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer cancel() + + // Complete fills in defaults and resolves environment overrides. + completedOpts, err := opts.Complete(ctx) + if err != nil { + fmt.Fprintf(os.Stderr, "Failed to complete options: %v\n", err) + os.Exit(1) + } + + // Validate rejects invalid flag combinations. + if errs := completedOpts.Validate(); len(errs) > 0 { + for _, e := range errs { + fmt.Fprintf(os.Stderr, "Invalid configuration: %v\n", e) + } + os.Exit(1) + } + + // Create root logger with component name. + logger := klog.Background().WithName(ComponentName) + ctx = klog.NewContext(ctx, logger) + + versionInfo := version.Get() + logger.Info("Starting server", + "version", versionInfo.Version, + "commit", versionInfo.GitCommit, + "buildDate", versionInfo.BuildDate, + ) + + // Build the apiserver configuration from completed options. + apiserverConfig, err := apiserver.NewConfig(ctx, completedOpts) + if err != nil { + logger.Error(err, "Failed to create apiserver config") + os.Exit(1) + } + + completedAPIServerConfig, err := apiserverConfig.Complete() + if err != nil { + logger.Error(err, "Failed to complete apiserver config") + os.Exit(1) + } + + // Build the storage backend configuration from completed options. + storageConfig, err := storagebackend.NewConfig(ctx, completedOpts.Storage) + if err != nil { + logger.Error(err, "Failed to create storage config") + os.Exit(1) + } + + completedStorageConfig, err := storageConfig.Complete() + if err != nil { + logger.Error(err, "Failed to complete storage config") + os.Exit(1) + } + + storage, err := completedStorageConfig.New() + if err != nil { + logger.Error(err, "Failed to create storage backend") + os.Exit(1) + } + + preparedStorage, err := storage.PrepareRun(ctx) + if err != nil { + logger.Error(err, "Failed to prepare storage backend") + os.Exit(1) + } + + // Create, prepare the device API server before starting the run loop. 
+ server, err := completedAPIServerConfig.New(storage) + if err != nil { + logger.Error(err, "Failed to create device API server") + os.Exit(1) + } + + prepared, err := server.PrepareRun(ctx) + if err != nil { + logger.Error(err, "Failed to prepare device API server") + os.Exit(1) + } + + // Run storage and server concurrently. If either fails, the errgroup + // cancels the shared context so the other component shuts down. + g, gctx := errgroup.WithContext(ctx) + + g.Go(func() error { + return preparedStorage.Run(gctx) + }) + + g.Go(func() error { + return prepared.Run(gctx) + }) + + if err := g.Wait(); err != nil { + logger.Error(err, "Server error") + os.Exit(1) + } + + logger.Info("Server stopped gracefully") +} diff --git a/cmd/device-apiserver/apiserver.go b/cmd/device-apiserver/apiserver.go deleted file mode 100644 index 3d2f8352a..000000000 --- a/cmd/device-apiserver/apiserver.go +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package main - -import ( - "os" - - "k8s.io/component-base/cli" - - "github.com/nvidia/nvsentinel/cmd/device-apiserver/app" -) - -func main() { - command := app.NewAPIServerCommand() - code := cli.Run(command) - os.Exit(code) -} diff --git a/cmd/device-apiserver/app/config.go b/cmd/device-apiserver/app/config.go deleted file mode 100644 index 520b4c0c2..000000000 --- a/cmd/device-apiserver/app/config.go +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package app - -import ( - "context" - - "github.com/nvidia/nvsentinel/cmd/device-apiserver/app/options" - controlplane "github.com/nvidia/nvsentinel/pkg/controlplane/apiserver" - "github.com/nvidia/nvsentinel/pkg/storage/storagebackend" -) - -type Config struct { - Options options.CompletedOptions - - Storage *storagebackend.Config - APIs *controlplane.Config -} - -type completedConfig struct { - Options options.CompletedOptions - - Storage storagebackend.CompletedConfig - APIs controlplane.CompletedConfig -} - -type CompletedConfig struct { - *completedConfig -} - -func NewConfig(ctx context.Context, opts options.CompletedOptions) (*Config, error) { - c := &Config{ - Options: opts, - } - - storageConfig, err := storagebackend.NewConfig(ctx, opts.Storage) - if err != nil { - return nil, err - } - - c.Storage = storageConfig - - controlPlaneConfig, err := controlplane.NewConfig(ctx, opts.CompletedOptions) - if err != nil { - return nil, err - } - - c.APIs = controlPlaneConfig - - return c, nil -} - -func (c *Config) Complete() (CompletedConfig, error) { - if c == nil || c.Storage == nil || c.APIs == nil { - return CompletedConfig{}, nil - } - - completedStorage, err := c.Storage.Complete() - if err != nil { - return CompletedConfig{}, err - } - - completedAPIs, err := c.APIs.Complete() - if err != nil { - return CompletedConfig{}, err - } - - return CompletedConfig{&completedConfig{ - Options: c.Options, - - Storage: completedStorage, - APIs: completedAPIs, - }}, nil -} diff --git a/cmd/device-apiserver/app/config_test.go b/cmd/device-apiserver/app/config_test.go deleted file mode 100644 index a02d0ec64..000000000 --- a/cmd/device-apiserver/app/config_test.go +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package app - -import ( - "context" - "testing" - - "github.com/nvidia/nvsentinel/cmd/device-apiserver/app/options" -) - -func TestConfig(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - opts := options.NewServerRunOptions() - - completedOpts, err := opts.Complete(ctx) - if err != nil { - t.Fatalf("Failed to complete options: %v", err) - } - - cfg, err := NewConfig(ctx, completedOpts) - if err != nil { - t.Fatalf("NewConfig failed: %v", err) - } - - if cfg.Storage == nil { - t.Error("NewConfig did not initialize Storage config") - } - if cfg.APIs == nil { - t.Error("NewConfig did not initialize APIs config") - } - - t.Run("Complete", func(t *testing.T) { - completedCfg, err := cfg.Complete() - if err != nil { - t.Fatalf("Complete failed: %v", err) - } - - if completedCfg.completedConfig == nil { - t.Fatal("CompletedConfig internal pointer is nil") - } - - validationErrors := completedCfg.Options.Validate() - if len(validationErrors) > 0 { - t.Errorf("CompletedConfig is invalid: %v", validationErrors) - } - }) - - t.Run("NilSafety", func(t *testing.T) { - var nilCfg *Config - _, err := nilCfg.Complete() - if err != nil { - t.Errorf("Complete() on nil config should not return error, got: %v", err) - } - - partialCfg := &Config{} - _, err = partialCfg.Complete() - if err != nil { - t.Errorf("Complete() on empty config should handle nil sub-fields gracefully, got: %v", err) - } - }) -} diff --git a/cmd/device-apiserver/app/main_test.go b/cmd/device-apiserver/app/main_test.go deleted file mode 100644 
index b1f6de7de..000000000 --- a/cmd/device-apiserver/app/main_test.go +++ /dev/null @@ -1,11 +0,0 @@ -package app - -import ( - "testing" - - "github.com/nvidia/nvsentinel/pkg/util/testutils" -) - -func TestMain(m *testing.M) { - testutils.VerifyTestMain(m) -} diff --git a/cmd/device-apiserver/app/options/options.go b/cmd/device-apiserver/app/options/options.go deleted file mode 100644 index 498edc89f..000000000 --- a/cmd/device-apiserver/app/options/options.go +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package options - -import ( - "context" - - cp "github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/options" - cliflag "k8s.io/component-base/cli/flag" -) - -type ServerRunOptions struct { - *cp.Options -} - -type completedOptions struct { - cp.CompletedOptions -} - -type CompletedOptions struct { - *completedOptions -} - -func NewServerRunOptions() *ServerRunOptions { - return &ServerRunOptions{ - Options: cp.NewOptions(), - } -} - -func (s *ServerRunOptions) Flags() cliflag.NamedFlagSets { - fss := cliflag.NamedFlagSets{} - if s == nil || s.Options == nil { - return fss - } - - s.AddFlags(&fss) - - return fss -} - -func (o *ServerRunOptions) Complete(ctx context.Context) (CompletedOptions, error) { - if o == nil { - return CompletedOptions{completedOptions: &completedOptions{}}, nil - } - - controlplane, err := o.Options.Complete(ctx) - if err != nil { - return CompletedOptions{}, err - } - - completed := completedOptions{ - CompletedOptions: controlplane, - } - - return CompletedOptions{ - completedOptions: &completed, - }, nil -} - -func (o completedOptions) Validate() []error { - errs := o.CompletedOptions.Validate() - - return errs -} diff --git a/cmd/device-apiserver/app/options/options_test.go b/cmd/device-apiserver/app/options/options_test.go deleted file mode 100644 index b81e5ac95..000000000 --- a/cmd/device-apiserver/app/options/options_test.go +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package options - -import ( - "context" - "testing" -) - -func TestServerRunOptions(t *testing.T) { - opts := NewServerRunOptions() - if opts == nil || opts.Options == nil { - t.Fatal("NewServerRunOptions failed to initialize internal options") - } - - fss := opts.Flags() - if len(fss.FlagSets) == 0 { - t.Error("Flags() returned empty NamedFlagSets; expected flags from internal options") - } - - var nilOpts *ServerRunOptions - nilFss := nilOpts.Flags() - if len(nilFss.FlagSets) != 0 { - t.Error("Flags() on nil options should return empty flag sets") - } - - t.Run("CompleteAndValidate", func(t *testing.T) { - ctx := context.Background() - - completed, err := opts.Complete(ctx) - if err != nil { - t.Fatalf("Complete failed: %v", err) - } - - if completed.completedOptions == nil { - t.Fatal("CompletedOptions internal pointer is nil") - } - - errs := completed.Validate() - if len(errs) > 0 { - t.Logf("Note: Default validation returned %d errors (this is expected if defaults require setup)", len(errs)) - } - }) - - t.Run("CompleteNil", func(t *testing.T) { - var nilOpts *ServerRunOptions - completed, err := nilOpts.Complete(context.Background()) - if err != nil { - t.Errorf("Complete() on nil options should not return error, got: %v", err) - } - if completed.completedOptions == nil { - t.Error("Complete() on nil options should return a valid wrapper") - } - }) -} diff --git a/cmd/device-apiserver/app/server.go b/cmd/device-apiserver/app/server.go deleted file mode 100644 index be9165554..000000000 --- a/cmd/device-apiserver/app/server.go +++ /dev/null @@ -1,157 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package app - -import ( - "context" - "os" - - "github.com/nvidia/nvsentinel/cmd/device-apiserver/app/options" - _ "github.com/nvidia/nvsentinel/pkg/services/device/v1alpha1" - "github.com/nvidia/nvsentinel/pkg/util/verflag" - utilversion "github.com/nvidia/nvsentinel/pkg/util/version" - "github.com/spf13/cobra" - "golang.org/x/sync/errgroup" - utilerrors "k8s.io/apimachinery/pkg/util/errors" - genericapiserver "k8s.io/apiserver/pkg/server" - cliflag "k8s.io/component-base/cli/flag" - "k8s.io/component-base/cli/globalflag" - "k8s.io/component-base/logs" - logsapi "k8s.io/component-base/logs/api/v1" - "k8s.io/component-base/term" - "k8s.io/klog/v2" -) - -// NewAPIServerCommand creates a *cobra.Command object with default parameters -func NewAPIServerCommand() *cobra.Command { - s := options.NewServerRunOptions() - ctx := genericapiserver.SetupSignalContext() - - cmd := &cobra.Command{ - Use: "device-apiserver", - Long: `The Device API server validates and configures data -for the api objects which include gpus and others. The API Server services -gRPC operations and provides the frontend to a node's shared state through -which all other node-local components interact.`, - - RunE: func(cmd *cobra.Command, args []string) error { - verflag.PrintAndExitIfRequested() - - fs := cmd.Flags() - // Activate logging as soon as possible, after that - // show flags with the final logging configuration. 
- logsapi.ReapplyHandling = logsapi.ReapplyHandlingIgnoreUnchanged - if err := logsapi.ValidateAndApply(s.Logs, nil); err != nil { - return err - } - - cliflag.PrintFlags(fs) - - // set default options - completedOptions, err := s.Complete(ctx) - if err != nil { - return err - } - - // validate options - if errs := completedOptions.Validate(); len(errs) != 0 { - return utilerrors.NewAggregate(errs) - } - - return Run(ctx, completedOptions) - }, - Args: cobra.NoArgs, - } - cmd.SetContext(ctx) - - fs := cmd.Flags() - namedFlagSets := s.Flags() - verflag.AddFlags(namedFlagSets.FlagSet("global")) - globalflag.AddGlobalFlags(namedFlagSets.FlagSet("global"), cmd.Name(), logs.SkipLoggingConfigurationFlags()) - - for _, f := range namedFlagSets.FlagSets { - fs.AddFlagSet(f) - } - - cols, _, _ := term.TerminalSize(cmd.OutOrStdout()) - cliflag.SetUsageAndHelpFunc(cmd, namedFlagSets, cols) - - return cmd -} - -// Run runs the specified APIServer. This should never exit. -func Run(ctx context.Context, opts options.CompletedOptions) error { - logger := klog.FromContext(ctx).WithValues("node", opts.NodeName) - ctx = klog.NewContext(ctx, logger) - - logger.Info("Initializing Device API Server", "version", utilversion.Get()) - logger.V(2).Info("Golang settings", - "GOGC", os.Getenv("GOGC"), - "GOMAXPROCS", os.Getenv("GOMAXPROCS"), - "GOTRACEBACK", os.Getenv("GOTRACEBACK"), - ) - - config, err := NewConfig(ctx, opts) - if err != nil { - return err - } - - completed, err := config.Complete() - if err != nil { - return err - } - - // Initialize and prepare storage to be injected into the server for readiness. - storage, err := completed.Storage.New() - if err != nil { - return err - } - - // Inject storage into the server to coordinate startup. 
- server, err := completed.APIs.New(storage) - if err != nil { - return err - } - - g, ctx := errgroup.WithContext(ctx) - - g.Go(func() error { - preparedStorage, err := storage.PrepareRun(ctx) - if err != nil { - return err - } - - return preparedStorage.Run(ctx) - }) - - g.Go(func() error { - preparedServer, err := server.PrepareRun(ctx) - if err != nil { - return err - } - - return preparedServer.Run(ctx) - }) - - err = g.Wait() - if err != nil { - logger.Error(err, "internal error: Device API Server exited with error") - return err - } - - logger.Info("Device API Server shut down gracefully") - - return nil -} diff --git a/cmd/device-apiserver/app/server_test.go b/cmd/device-apiserver/app/server_test.go deleted file mode 100644 index a81dac2da..000000000 --- a/cmd/device-apiserver/app/server_test.go +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package app - -import ( - "context" - "fmt" - "os" - "path/filepath" - "strings" - "testing" - "time" - - "github.com/nvidia/nvsentinel/cmd/device-apiserver/app/options" - "github.com/nvidia/nvsentinel/pkg/util/testutils" -) - -func TestRun(t *testing.T) { - opts := options.NewServerRunOptions() - - localSocket := testutils.NewUnixAddr(t) - kineSocket := fmt.Sprintf("unix://%s", testutils.NewUnixAddr(t)) - healthAddr := testutils.GetFreeTCPAddress(t) - - opts.GRPC.BindAddress = "unix://" + localSocket - opts.HealthAddress = healthAddr - opts.NodeName = "test-node" - - tmpDir := t.TempDir() - opts.Storage.DatabaseDir = tmpDir - opts.Storage.DatabasePath = tmpDir + "state.db" - opts.Storage.KineSocketPath = kineSocket - opts.Storage.KineConfig.Endpoint = fmt.Sprintf("sqlite://%s/db.sqlite", tmpDir) - opts.Storage.KineConfig.Listener = kineSocket - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - completedOpts, err := opts.Complete(ctx) - if err != nil { - t.Fatalf("Failed to complete options: %v", err) - } - - errCh := make(chan error, 1) - go func() { - errCh <- Run(ctx, completedOpts) - }() - - testutils.WaitForStatus(t, healthAddr, "", 5*time.Second, testutils.IsServing) - - cancel() - - select { - case err := <-errCh: - if err != nil && err != context.Canceled { - t.Errorf("exited with unexpected error: %v", err) - } - case <-time.After(5 * time.Second): - t.Fatal("Failed to shut down within grace period") - } - - if _, err := os.Stat(localSocket); err == nil { - t.Errorf("socket file %q still exists after shutdown", localSocket) - } -} - -func TestRun_StorageFailure(t *testing.T) { - opts := options.NewServerRunOptions() - - tmpDir := t.TempDir() - readOnlyDir := filepath.Join(tmpDir, "readonly") - if err := os.Mkdir(readOnlyDir, 0444); err != nil { - t.Fatal(err) - } - - opts.NodeName = "test-node" - opts.Storage.DatabaseDir = readOnlyDir - opts.Storage.DatabasePath = readOnlyDir + "state.db" - opts.Storage.KineSocketPath = 
filepath.Join(readOnlyDir, "kine.sock") - opts.Storage.KineConfig.Endpoint = fmt.Sprintf("sqlite://%s/db.sqlite", readOnlyDir) - - opts.HealthAddress = testutils.GetFreeTCPAddress(t) - opts.GRPC.BindAddress = "unix://" + filepath.Join(tmpDir, "api.sock") - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - completedOpts, _ := opts.Complete(ctx) - - errCh := make(chan error, 1) - go func() { - errCh <- Run(ctx, completedOpts) - }() - - select { - case err := <-errCh: - if err == nil { - t.Error("Expected server to fail due to storage error, but it exited with nil") - } - if !strings.Contains(err.Error(), "storage") && !strings.Contains(err.Error(), "permission denied") { - t.Errorf("Expected storage or permission error, got: %v", err) - } - case <-time.After(5 * time.Second): - t.Fatal("Server should have failed immediately on storage error, but it timed out/hung") - } -} diff --git a/cmd/nvml-provider/main.go b/cmd/nvml-provider/main.go new file mode 100644 index 000000000..57ec0f835 --- /dev/null +++ b/cmd/nvml-provider/main.go @@ -0,0 +1,726 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build nvml + +// Command nvml-provider is a standalone NVML-based GPU health provider that +// connects to a device-api-server instance via gRPC. 
+// +// This is designed to run as a sidecar container alongside device-api-server, +// providing GPU enumeration and health monitoring via NVML. +// +// Usage: +// +// nvml-provider --server-address=localhost:9001 --driver-root=/run/nvidia/driver +package main + +import ( + "context" + "flag" + "fmt" + "net" + "net/http" + "os" + "os/signal" + "strings" + "sync" + "syscall" + "time" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/health/grpc_health_v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + + devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1" + clientset "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned" + gpuclient "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned/typed/device/v1alpha1" + nvmlpkg "github.com/nvidia/nvsentinel/pkg/providers/nvml" +) + +const ( + // DefaultProviderID is the default identifier for this provider. + DefaultProviderID = "nvml-provider-sidecar" + + // HeartbeatInterval is how often to send heartbeats. + HeartbeatInterval = 10 * time.Second + + // HealthCheckPort is the HTTP port for health checks. + HealthCheckPort = 8082 + + // EventTimeout is the timeout for NVML event wait (in milliseconds). + EventTimeout = 5000 + + // DefaultServerAddress is the default device-api-server address. + DefaultServerAddress = "localhost:9001" + + // ConnectionRetryInterval is how long to wait between connection attempts. + ConnectionRetryInterval = 5 * time.Second + + // MaxConnectionRetries is the maximum number of connection attempts. + MaxConnectionRetries = 60 +) + +// Config holds the provider configuration. +type Config struct { + ServerAddress string + ProviderID string + DriverRoot string + HealthCheckEnabled bool + HealthCheckPort int + IgnoredXids []uint64 +} + +// DefaultConfig returns a Config with sensible defaults. 
+func DefaultConfig() Config { + return Config{ + ServerAddress: DefaultServerAddress, + ProviderID: DefaultProviderID, + DriverRoot: "/run/nvidia/driver", + HealthCheckEnabled: true, + HealthCheckPort: HealthCheckPort, + } +} + +// Provider is the standalone NVML provider that connects to device-api-server. +type Provider struct { + config Config + logger klog.Logger + + // gRPC clients + conn *grpc.ClientConn + gpuClient gpuclient.GPUInterface + healthClient grpc_health_v1.HealthClient + + // NVML + nvmllib nvml.Interface + eventSet nvml.EventSet + + // State + mu sync.RWMutex + gpuUUIDs []string + initialized bool + connected bool + healthy bool + monitorRunning bool + + // Lifecycle + ctx context.Context + cancel context.CancelFunc + wg sync.WaitGroup +} + +// NewProvider creates a new standalone NVML provider. +func NewProvider(cfg Config, logger klog.Logger) *Provider { + return &Provider{ + config: cfg, + logger: logger.WithName("nvml-provider"), + } +} + +func main() { + // Initialize logging flags first + klog.InitFlags(nil) + + cfg := parseFlags() + // flag.Parse() is called inside parseFlags() + + logger := klog.Background() + logger.Info("Starting NVML provider sidecar", + "serverAddress", cfg.ServerAddress, + "providerID", cfg.ProviderID, + "driverRoot", cfg.DriverRoot, + "healthCheckEnabled", cfg.HealthCheckEnabled, + ) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Handle signals + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) + go func() { + sig := <-sigCh + logger.Info("Received signal, shutting down", "signal", sig) + cancel() + }() + + // Create and run provider + provider := NewProvider(cfg, logger) + if err := provider.Run(ctx); err != nil { + logger.Error(err, "Provider failed") + os.Exit(1) + } + + logger.Info("NVML provider shutdown complete") +} + +func parseFlags() Config { + cfg := DefaultConfig() + + flag.StringVar(&cfg.ServerAddress, "server-address", 
cfg.ServerAddress, + "Address of device-api-server gRPC endpoint") + flag.StringVar(&cfg.ProviderID, "provider-id", cfg.ProviderID, + "Unique identifier for this provider") + flag.StringVar(&cfg.DriverRoot, "driver-root", cfg.DriverRoot, + "Root path for NVIDIA driver libraries") + flag.BoolVar(&cfg.HealthCheckEnabled, "health-check", cfg.HealthCheckEnabled, + "Enable XID event monitoring for health checks") + flag.IntVar(&cfg.HealthCheckPort, "health-port", cfg.HealthCheckPort, + "HTTP port for health check endpoints") + + // Parse flags + flag.Parse() + + // Track which flags were explicitly set on the command line. + explicitFlags := make(map[string]bool) + flag.Visit(func(f *flag.Flag) { + explicitFlags[f.Name] = true + }) + + // Environment variables are used as fallback when the corresponding + // flag was not explicitly provided on the command line. + if !explicitFlags["server-address"] { + if addr := os.Getenv("PROVIDER_SERVER_ADDRESS"); addr != "" { + cfg.ServerAddress = addr + } + } + if !explicitFlags["provider-id"] { + if id := os.Getenv("PROVIDER_ID"); id != "" { + cfg.ProviderID = id + } + } + if !explicitFlags["driver-root"] { + // NVIDIA_DRIVER_ROOT follows the NVIDIA Container Toolkit convention. + // See: https://github.com/NVIDIA/nvidia-container-toolkit + if root := os.Getenv("NVIDIA_DRIVER_ROOT"); root != "" { + cfg.DriverRoot = root + } + } + + return cfg +} + +// Run starts the provider and blocks until the context is cancelled. 
+func (p *Provider) Run(ctx context.Context) error { + p.ctx, p.cancel = context.WithCancel(ctx) + defer p.cancel() + + // Start health check server + p.wg.Add(1) + go p.runHealthServer() + + // Initialize NVML + if err := p.initNVML(); err != nil { + return fmt.Errorf("failed to initialize NVML: %w", err) + } + defer p.shutdownNVML() + + // Connect to server with retry + if err := p.connectWithRetry(); err != nil { + return fmt.Errorf("failed to connect to server: %w", err) + } + defer p.disconnect() + + // Enumerate and register GPUs (or reconcile if reconnecting) + if err := p.enumerateAndRegisterGPUs(); err != nil { + return fmt.Errorf("failed to enumerate GPUs: %w", err) + } + + // Reconcile state (handles restart/reconnection scenarios) + if err := p.ReconcileState(p.ctx); err != nil { + // Reconciliation failure is not fatal - log and continue + p.logger.Error(err, "State reconciliation failed, continuing") + } + + // Start heartbeat loop + p.wg.Add(1) + go p.runHeartbeatLoop() + + // Start health monitoring if enabled + if p.config.HealthCheckEnabled && len(p.gpuUUIDs) > 0 { + p.wg.Add(1) + go p.runHealthMonitor() + } + + // Mark as healthy + p.setHealthy(true) + + // Wait for shutdown + <-p.ctx.Done() + + // Graceful shutdown + p.setHealthy(false) + p.wg.Wait() + + return nil +} + +// initNVML initializes the NVML library. 
+func (p *Provider) initNVML() error { + // Find NVML library + libraryPath := nvmlpkg.FindDriverLibrary(p.config.DriverRoot) + if libraryPath != "" { + p.logger.V(2).Info("Using NVML library", "path", libraryPath) + p.nvmllib = nvml.New(nvml.WithLibraryPath(libraryPath)) + } else { + p.logger.V(2).Info("Using system default NVML library") + p.nvmllib = nvml.New() + } + + // Initialize + ret := p.nvmllib.Init() + if ret != nvml.SUCCESS { + return fmt.Errorf("NVML init failed: %v", nvml.ErrorString(ret)) + } + + // Log driver version + if version, ret := p.nvmllib.SystemGetDriverVersion(); ret == nvml.SUCCESS { + p.logger.Info("NVML initialized", "driverVersion", version) + } + + p.initialized = true + return nil +} + +// shutdownNVML shuts down the NVML library. +func (p *Provider) shutdownNVML() { + if !p.initialized { + return + } + + if p.eventSet != nil { + p.eventSet.Free() + p.eventSet = nil + } + + p.nvmllib.Shutdown() + p.initialized = false + p.logger.V(1).Info("NVML shutdown complete") +} + + +// isLocalhostAddress returns true if the address refers to the local machine. +func isLocalhostAddress(addr string) bool { + // Unix socket paths are inherently local. + if strings.HasPrefix(addr, "unix://") || strings.HasPrefix(addr, "/") { + return true + } + host := addr + if h, _, err := net.SplitHostPort(addr); err == nil { + host = h + } + return host == "localhost" || host == "127.0.0.1" || host == "::1" || host == "" +} + +// connectWithRetry connects to the device-api-server with retry logic. +func (p *Provider) connectWithRetry() error { + // Validate that ServerAddress is localhost when using insecure credentials. + // This prevents accidental exposure of unencrypted gRPC traffic over the network. 
+ if !isLocalhostAddress(p.config.ServerAddress) { + return fmt.Errorf("insecure credentials require localhost address, got %q; "+ + "set --server-address to localhost: or use TLS", p.config.ServerAddress) + } + + var lastErr error + + for i := 0; i < MaxConnectionRetries; i++ { + select { + case <-p.ctx.Done(): + return p.ctx.Err() + default: + } + + // Insecure credentials are acceptable here: the provider connects to + // device-api-server via localhost within the same pod (sidecar pattern). + conn, err := grpc.NewClient( + p.config.ServerAddress, + grpc.WithTransportCredentials(insecure.NewCredentials()), + ) + if err != nil { + lastErr = err + p.logger.V(1).Info("Connection attempt failed, retrying", + "attempt", i+1, + "error", err, + ) + time.Sleep(ConnectionRetryInterval) + continue + } + + p.conn = conn + cs := clientset.New(conn) + p.gpuClient = cs.DeviceV1alpha1().GPUs() + p.healthClient = grpc_health_v1.NewHealthClient(conn) + + // Wait for server to be ready + if err := p.waitForServerReady(); err != nil { + conn.Close() + lastErr = err + p.logger.V(1).Info("Server not ready, retrying", + "attempt", i+1, + "error", err, + ) + time.Sleep(ConnectionRetryInterval) + continue + } + + p.connected = true + p.logger.Info("Connected to device-api-server", "address", p.config.ServerAddress) + return nil + } + + return fmt.Errorf("failed to connect after %d attempts: %w", MaxConnectionRetries, lastErr) +} + +// waitForServerReady waits for the server to report healthy. +func (p *Provider) waitForServerReady() error { + ctx, cancel := context.WithTimeout(p.ctx, 5*time.Second) + defer cancel() + + resp, err := p.healthClient.Check(ctx, &grpc_health_v1.HealthCheckRequest{}) + if err != nil { + return fmt.Errorf("health check failed: %w", err) + } + + if resp.Status != grpc_health_v1.HealthCheckResponse_SERVING { + return fmt.Errorf("server not serving: %v", resp.Status) + } + + return nil +} + +// disconnect closes the gRPC connection. 
+func (p *Provider) disconnect() { + if p.conn != nil { + p.conn.Close() + p.conn = nil + } + p.connected = false +} + +// enumerateAndRegisterGPUs discovers GPUs via NVML and registers them. +func (p *Provider) enumerateAndRegisterGPUs() error { + count, ret := p.nvmllib.DeviceGetCount() + if ret != nvml.SUCCESS { + return fmt.Errorf("failed to get device count: %v", nvml.ErrorString(ret)) + } + + if count == 0 { + p.logger.Info("No GPUs found on this node") + return nil + } + + p.logger.Info("Enumerating GPUs", "count", count) + uuids := make([]string, 0, count) + + for i := 0; i < count; i++ { + device, ret := p.nvmllib.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + p.logger.Error(nil, "Failed to get device handle", "index", i, "error", nvml.ErrorString(ret)) + continue + } + + uuid, ret := device.GetUUID() + if ret != nvml.SUCCESS { + p.logger.Error(nil, "Failed to get device UUID", "index", i, "error", nvml.ErrorString(ret)) + continue + } + + // Get device info for registration + productName, _ := device.GetName() + var memoryBytes uint64 + if memInfo, ret := device.GetMemoryInfo(); ret == nvml.SUCCESS { + memoryBytes = memInfo.Total + } + + // Register GPU with server + if err := p.registerGPU(uuid, productName, memoryBytes); err != nil { + p.logger.Error(err, "Failed to register GPU", "uuid", uuid) + continue + } + + uuids = append(uuids, uuid) + p.logger.Info("Registered GPU", + "uuid", uuid, + "productName", productName, + "memory", nvmlpkg.FormatBytes(memoryBytes), + ) + } + + p.mu.Lock() + p.gpuUUIDs = uuids + p.mu.Unlock() + + p.logger.Info("GPU enumeration complete", "registered", len(uuids)) + return nil +} + +// registerGPU registers a single GPU with the device-api-server using Create. 
+func (p *Provider) registerGPU(uuid, productName string, memoryBytes uint64) error { + ctx, cancel := context.WithTimeout(p.ctx, 5*time.Second) + defer cancel() + + gpu := &devicev1alpha1.GPU{ + ObjectMeta: metav1.ObjectMeta{Name: uuid}, + Spec: devicev1alpha1.GPUSpec{UUID: uuid}, + Status: devicev1alpha1.GPUStatus{ + Conditions: []metav1.Condition{ + { + Type: nvmlpkg.ConditionTypeNVMLReady, + Status: metav1.ConditionStatus(nvmlpkg.ConditionStatusTrue), + Reason: "Initialized", + Message: fmt.Sprintf("GPU enumerated via NVML: %s (%s)", productName, nvmlpkg.FormatBytes(memoryBytes)), + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + _, err := p.gpuClient.Create(ctx, gpu, metav1.CreateOptions{}) + return err +} + +// runHeartbeatLoop sends periodic heartbeats to the server. +func (p *Provider) runHeartbeatLoop() { + defer p.wg.Done() + + ticker := time.NewTicker(HeartbeatInterval) + defer ticker.Stop() + + for { + select { + case <-p.ctx.Done(): + return + case <-ticker.C: + if err := p.sendHeartbeat(); err != nil { + p.logger.Error(err, "Failed to send heartbeat") + } + } + } +} + +// sendHeartbeat performs a health check on the server connection. +// Note: The Heartbeat RPC was removed. We now just verify the server is reachable. +func (p *Provider) sendHeartbeat() error { + ctx, cancel := context.WithTimeout(p.ctx, 5*time.Second) + defer cancel() + + // Verify server connectivity by checking gRPC health + resp, err := p.healthClient.Check(ctx, &grpc_health_v1.HealthCheckRequest{}) + if err != nil { + return err + } + + if resp.Status != grpc_health_v1.HealthCheckResponse_SERVING { + return fmt.Errorf("server not serving: %v", resp.Status) + } + + p.mu.RLock() + gpuCount := len(p.gpuUUIDs) + p.mu.RUnlock() + + p.logger.V(4).Info("Health check passed", "gpuCount", gpuCount) + return nil +} + +// runHealthMonitor monitors NVML events for GPU health changes. 
+func (p *Provider) runHealthMonitor() { + defer p.wg.Done() + + p.mu.Lock() + p.monitorRunning = true + p.mu.Unlock() + + defer func() { + p.mu.Lock() + p.monitorRunning = false + p.mu.Unlock() + }() + + // Create event set + eventSet, ret := p.nvmllib.EventSetCreate() + if ret != nvml.SUCCESS { + p.logger.Error(nil, "Failed to create event set", "error", nvml.ErrorString(ret)) + return + } + defer eventSet.Free() + p.eventSet = eventSet + + // Register devices for XID events + deviceCount, ret := p.nvmllib.DeviceGetCount() + if ret != nvml.SUCCESS { + p.logger.Error(nil, "Failed to get device count", "error", nvml.ErrorString(ret)) + return + } + + for i := 0; i < deviceCount; i++ { + device, ret := p.nvmllib.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + continue + } + ret = device.RegisterEvents(nvml.EventTypeXidCriticalError|nvml.EventTypeSingleBitEccError|nvml.EventTypeDoubleBitEccError, eventSet) + if ret != nvml.SUCCESS { + p.logger.V(1).Info("Failed to register events for device", "index", i, "error", nvml.ErrorString(ret)) + } + } + + p.logger.Info("Health monitor started") + + // Event loop + for { + select { + case <-p.ctx.Done(): + return + default: + } + + data, ret := eventSet.Wait(EventTimeout) + if ret == nvml.ERROR_TIMEOUT { + continue + } + if ret != nvml.SUCCESS { + p.logger.V(1).Info("Event wait error", "error", nvml.ErrorString(ret)) + continue + } + + p.handleXIDEvent(data) + } +} + +// handleXIDEvent processes an XID error event. +func (p *Provider) handleXIDEvent(data nvml.EventData) { + if data.Device == nil { + p.logger.Error(nil, "Received XID event with nil device handle") + return + } + + uuid, ret := data.Device.GetUUID() + if ret != nvml.SUCCESS { + p.logger.Error(nil, "Failed to get device UUID from event") + return + } + + xid := data.EventData + p.logger.Info("XID event received", + "uuid", uuid, + "xid", xid, + "eventType", data.EventType, + ) + + // Skip ignored XIDs (application-level errors, not hardware failures). 
+ // This matches the in-process provider behavior in pkg/providers/nvml/health_monitor.go. + if nvmlpkg.IsDefaultIgnored(xid) { + p.logger.V(2).Info("Ignoring non-critical XID", + "uuid", uuid, + "xid", xid, + ) + return + } + + // Only critical XIDs trigger a health state change. + // Non-critical, non-ignored XIDs are logged but do not update GPU status, + // matching the in-process provider behavior in pkg/providers/nvml/health_monitor.go. + if !nvmlpkg.IsCriticalXid(xid) { + p.logger.V(2).Info("Non-critical XID, skipping status update", + "uuid", uuid, + "xid", xid, + ) + return + } + + p.logger.Info("Critical XID error detected", + "uuid", uuid, + "xid", xid, + ) + + ctx, cancel := context.WithTimeout(p.ctx, 5*time.Second) + defer cancel() + + gpu := &devicev1alpha1.GPU{ + ObjectMeta: metav1.ObjectMeta{Name: uuid}, + Status: devicev1alpha1.GPUStatus{ + Conditions: []metav1.Condition{ + { + Type: nvmlpkg.ConditionTypeNVMLReady, + Status: metav1.ConditionStatus(nvmlpkg.ConditionStatusFalse), + Reason: "XIDError", + Message: fmt.Sprintf("Critical XID error: %d", xid), + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + if _, err := p.gpuClient.UpdateStatus(ctx, gpu, metav1.UpdateOptions{}); err != nil { + p.logger.Error(err, "Failed to update GPU status", "uuid", uuid) + } +} + +// runHealthServer runs the HTTP health check server. 
+func (p *Provider) runHealthServer() { + defer p.wg.Done() + + mux := http.NewServeMux() + mux.HandleFunc("/healthz", p.handleHealthz) + mux.HandleFunc("/readyz", p.handleReadyz) + mux.HandleFunc("/livez", p.handleHealthz) + + server := &http.Server{ + Addr: fmt.Sprintf(":%d", p.config.HealthCheckPort), + Handler: mux, + ReadHeaderTimeout: 5 * time.Second, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + } + + go func() { + <-p.ctx.Done() + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + server.Shutdown(ctx) + }() + + p.logger.Info("Health server started", "port", p.config.HealthCheckPort) + if err := server.ListenAndServe(); err != http.ErrServerClosed { + p.logger.Error(err, "Health server error") + } +} + +func (p *Provider) handleHealthz(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write([]byte("ok\n")) +} + +func (p *Provider) handleReadyz(w http.ResponseWriter, _ *http.Request) { + p.mu.RLock() + healthy := p.healthy + p.mu.RUnlock() + + if healthy { + w.WriteHeader(http.StatusOK) + w.Write([]byte("ok\n")) + } else { + w.WriteHeader(http.StatusServiceUnavailable) + w.Write([]byte("not ready\n")) + } +} + +func (p *Provider) setHealthy(healthy bool) { + p.mu.Lock() + p.healthy = healthy + p.mu.Unlock() +} + diff --git a/cmd/nvml-provider/reconciler.go b/cmd/nvml-provider/reconciler.go new file mode 100644 index 000000000..af5f68b6c --- /dev/null +++ b/cmd/nvml-provider/reconciler.go @@ -0,0 +1,308 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build nvml + +package main + +import ( + "context" + "fmt" + "time" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1" + nvmlpkg "github.com/nvidia/nvsentinel/pkg/providers/nvml" +) + +// ReconcileState reconciles the provider's state with the device-api-server. +// +// This is called on startup and after reconnection to ensure: +// 1. GPUs that were removed while disconnected are unregistered +// 2. GPUs that were added while disconnected are registered +// 3. 
GPU health states are reconciled with current NVML state +// +// This handles scenarios like: +// - Provider crash and restart +// - Network partition recovery +// - GPU hotplug/removal during provider downtime +func (p *Provider) ReconcileState(ctx context.Context) error { + p.logger.Info("Starting state reconciliation") + + // Step 1: Get current state from server + cachedGPUs, err := p.listCachedGPUs(ctx) + if err != nil { + return fmt.Errorf("failed to list cached GPUs: %w", err) + } + + p.logger.V(1).Info("Retrieved cached GPU state", "count", len(cachedGPUs)) + + // Step 2: Get current GPU UUIDs from NVML + currentUUIDs, err := p.getCurrentGPUUUIDs() + if err != nil { + return fmt.Errorf("failed to get current GPU UUIDs: %w", err) + } + + p.logger.V(1).Info("Current GPUs from NVML", "count", len(currentUUIDs)) + + // Build lookup maps + cachedUUIDSet := make(map[string]*devicev1alpha1.GPU) + for i := range cachedGPUs { + gpu := &cachedGPUs[i] + cachedUUIDSet[gpu.Spec.UUID] = gpu + } + + currentUUIDSet := make(map[string]bool) + for _, uuid := range currentUUIDs { + currentUUIDSet[uuid] = true + } + + // Step 3: Find and unregister removed GPUs + for uuid := range cachedUUIDSet { + if !currentUUIDSet[uuid] { + p.logger.Info("GPU was removed, unregistering", "uuid", uuid) + if err := p.unregisterGPU(ctx, uuid); err != nil { + p.logger.Error(err, "Failed to unregister removed GPU", "uuid", uuid) + // Continue with other GPUs + } + } + } + + // Step 4: Find and register new GPUs + for _, uuid := range currentUUIDs { + if _, exists := cachedUUIDSet[uuid]; !exists { + p.logger.Info("New GPU found, registering", "uuid", uuid) + if err := p.registerNewGPU(ctx, uuid); err != nil { + p.logger.Error(err, "Failed to register new GPU", "uuid", uuid) + // Continue with other GPUs + } + } + } + + // Step 5: Reconcile health state for existing GPUs + for _, uuid := range currentUUIDs { + if cachedGPU, exists := cachedUUIDSet[uuid]; exists { + if err := 
p.reconcileGPUHealth(ctx, uuid, cachedGPU); err != nil { + p.logger.Error(err, "Failed to reconcile GPU health", "uuid", uuid) + // Continue with other GPUs + } + } + } + + // Step 6: Update local GPU list + p.mu.Lock() + p.gpuUUIDs = currentUUIDs + p.mu.Unlock() + + p.logger.Info("State reconciliation complete", + "totalGPUs", len(currentUUIDs), + ) + + return nil +} + +// listCachedGPUs retrieves the list of GPUs from the server cache. +// +// Note: This lists ALL GPUs, not just those from this provider. +// TODO: Add provider_id filtering to ListGpus RPC for efficiency. +func (p *Provider) listCachedGPUs(ctx context.Context) ([]devicev1alpha1.GPU, error) { + // Note: If the parent context has a shorter deadline, WithTimeout + // inherits the parent's deadline. This is the correct behavior: + // reconciliation should respect the overall operation timeout. + ctx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + gpuList, err := p.gpuClient.List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, err + } + + // Filter to only GPUs that might belong to this provider + // For now, we assume all GPUs belong to us since we're the only provider + // A more robust solution would use provider_id filtering + return gpuList.Items, nil +} + +// getCurrentGPUUUIDs gets the list of GPU UUIDs currently visible to NVML. +func (p *Provider) getCurrentGPUUUIDs() ([]string, error) { + count, ret := p.nvmllib.DeviceGetCount() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("failed to get device count: %v", nvml.ErrorString(ret)) + } + + uuids := make([]string, 0, count) + for i := 0; i < count; i++ { + device, ret := p.nvmllib.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + continue + } + + uuid, ret := device.GetUUID() + if ret != nvml.SUCCESS { + continue + } + + uuids = append(uuids, uuid) + } + + return uuids, nil +} + +// unregisterGPU removes a GPU from the server using Delete. 
+func (p *Provider) unregisterGPU(ctx context.Context, uuid string) error { + // Note: If the parent context has a shorter deadline, WithTimeout + // inherits the parent's deadline. This is the correct behavior: + // reconciliation should respect the overall operation timeout. + ctx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + return p.gpuClient.Delete(ctx, uuid, metav1.DeleteOptions{}) +} + +// registerNewGPU registers a newly discovered GPU. +func (p *Provider) registerNewGPU(ctx context.Context, uuid string) error { + // Get device info from NVML + productName := "Unknown" + var memoryBytes uint64 + + // Find the device by UUID + count, ret := p.nvmllib.DeviceGetCount() + if ret == nvml.SUCCESS { + for i := 0; i < count; i++ { + device, ret := p.nvmllib.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + continue + } + deviceUUID, ret := device.GetUUID() + if ret != nvml.SUCCESS || deviceUUID != uuid { + continue + } + + // Found the device + if name, ret := device.GetName(); ret == nvml.SUCCESS { + productName = name + } + if memInfo, ret := device.GetMemoryInfo(); ret == nvml.SUCCESS { + memoryBytes = memInfo.Total + } + break + } + } + + return p.registerGPU(uuid, productName, memoryBytes) +} + +// reconcileGPUHealth compares cached health state with current NVML state. +// +// If the GPU was marked as Unknown (due to provider timeout) but is now +// healthy per NVML, we update it back to healthy. 
+func (p *Provider) reconcileGPUHealth(ctx context.Context, uuid string, cachedGPU *devicev1alpha1.GPU) error { + // Check if the cached state shows Unknown (from heartbeat timeout) + var cachedCondition *metav1.Condition + for i := range cachedGPU.Status.Conditions { + cond := &cachedGPU.Status.Conditions[i] + if cond.Type == "Ready" || cond.Type == nvmlpkg.ConditionTypeNVMLReady { + cachedCondition = cond + break + } + } + + // If the condition is Unknown, query NVML and update if healthy + if cachedCondition != nil && string(cachedCondition.Status) == nvmlpkg.ConditionStatusUnknown { + p.logger.Info("GPU has Unknown status, checking current NVML state", "uuid", uuid) + + // For now, if we can enumerate the GPU via NVML, consider it healthy + // A more sophisticated check would query specific health indicators + healthy, err := p.isGPUHealthy(uuid) + if err != nil { + return fmt.Errorf("failed to check GPU health: %w", err) + } + + if healthy { + p.logger.Info("GPU is healthy per NVML, updating status", "uuid", uuid) + return p.updateGPUCondition(ctx, uuid, nvmlpkg.ConditionStatusTrue, "Recovered", "GPU recovered after provider reconnection") + } + } + + return nil +} + +// isGPUHealthy checks if a GPU is healthy via NVML. +func (p *Provider) isGPUHealthy(uuid string) (bool, error) { + // Find device by UUID + count, ret := p.nvmllib.DeviceGetCount() + if ret != nvml.SUCCESS { + return false, fmt.Errorf("failed to get device count: %v", nvml.ErrorString(ret)) + } + + for i := 0; i < count; i++ { + device, ret := p.nvmllib.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + continue + } + deviceUUID, ret := device.GetUUID() + if ret != nvml.SUCCESS || deviceUUID != uuid { + continue + } + + // Device found - check basic health indicators + // 1. Can we get memory info? (basic liveness check) + if _, ret := device.GetMemoryInfo(); ret != nvml.SUCCESS { + return false, nil + } + + // 2. 
Check for pending page retirements (ECC errors) + if pending, ret := device.GetRetiredPagesPendingStatus(); ret == nvml.SUCCESS { + if pending == nvml.FEATURE_ENABLED { + p.logger.V(1).Info("GPU has pending page retirements", "uuid", uuid) + return false, nil + } + } + + // Device is accessible and no pending issues + return true, nil + } + + // Device not found - not healthy + return false, nil +} + +// updateGPUCondition updates a GPU's status via UpdateStatus. +func (p *Provider) updateGPUCondition(ctx context.Context, uuid, status, reason, message string) error { + // Note: If the parent context has a shorter deadline, WithTimeout + // inherits the parent's deadline. This is the correct behavior: + // reconciliation should respect the overall operation timeout. + ctx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + gpu := &devicev1alpha1.GPU{ + ObjectMeta: metav1.ObjectMeta{Name: uuid}, + Status: devicev1alpha1.GPUStatus{ + Conditions: []metav1.Condition{ + { + Type: nvmlpkg.ConditionTypeNVMLReady, + Status: metav1.ConditionStatus(status), + Reason: reason, + Message: message, + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + _, err := p.gpuClient.UpdateStatus(ctx, gpu, metav1.UpdateOptions{}) + return err +} diff --git a/code-generator/cmd/client-gen/generators/generator_for_type.go b/code-generator/cmd/client-gen/generators/generator_for_type.go index dc4a11bef..028a65658 100644 --- a/code-generator/cmd/client-gen/generators/generator_for_type.go +++ b/code-generator/cmd/client-gen/generators/generator_for_type.go @@ -15,7 +15,7 @@ limitations under the License. */ /* -Portions Copyright (c) 2025 NVIDIA CORPORATION. All rights reserved. +Portions Copyright (c) 2026 NVIDIA CORPORATION. All rights reserved. Modified from the original to support gRPC transport. 
Origin: https://github.com/kubernetes/code-generator/blob/v0.34.1/cmd/client-gen/generators/generator_for_type.go @@ -401,9 +401,24 @@ func (c *$.type|allLowercasePlural$) Update(ctx $.context|raw$, $.type|allLowerc ` var updateStatusTemplate = ` -// TODO: Implement UpdateStatus support. +// UpdateStatus updates only the status subresource of a $.type|public$. func (c *$.type|allLowercasePlural$) UpdateStatus(ctx $.context|raw$, $.type|allLowercase$ *$.type|raw$, opts $.UpdateOptions|raw$) (*$.type|raw$, error) { - return nil, $.fmtErrorf|raw$("UpdateStatus not implemented") + resp, err := c.client.Update$.ProtoType$Status(ctx, &$.pb$.Update$.ProtoType$StatusRequest{ + $.ProtoType$: $.ToProto|raw$($.type|allLowercase$), + Opts: &$.pb$.UpdateOptions{}, + }) + if err != nil { + return nil, err + } + + obj := $.FromProto|raw$(resp) + c.logger.V(2).Info("Updated $.type|public$ status", + "name", obj.GetName(), + "namespace", c.getNamespace(), + "resource-version", obj.GetResourceVersion(), + ) + + return obj, nil } ` diff --git a/demos/nvml-sidecar-demo.sh b/demos/nvml-sidecar-demo.sh new file mode 100755 index 000000000..cb5ffe06d --- /dev/null +++ b/demos/nvml-sidecar-demo.sh @@ -0,0 +1,752 @@ +#!/bin/bash +# NVML Provider Sidecar Demo +# Demonstrates the NVML provider sidecar architecture for GPU enumeration +# +# Prerequisites: +# - kubectl configured with GPU cluster access +# - docker with buildx for building images +# - helm 3.x installed +# - GPU nodes with RuntimeClass 'nvidia' +# +# Usage: ./demos/nvml-sidecar-demo.sh [kubeconfig] +# +# Environment Variables (all optional): +# KUBECONFIG - Path to kubeconfig file (default: $HOME/.kube/config) +# NAMESPACE - Kubernetes namespace (default: device-api) +# RELEASE_NAME - Helm release name (default: device-api-server) +# IMAGE_REGISTRY - Container registry (default: ttl.sh) +# IMAGE_TAG - Image tag (default: 2h for ttl.sh expiry) +# SERVER_IMAGE - Full device-api-server image (default: 
$IMAGE_REGISTRY/device-api-server:$IMAGE_TAG) +# SIDECAR_IMAGE - Full sidecar image (default: $IMAGE_REGISTRY/device-api-server-sidecar:$IMAGE_TAG) +# BUILD_PLATFORM - Target platform for builds (default: linux/amd64) +# GPU_NODE_SELECTOR - Label selector for GPU nodes (default: nvidia.com/gpu.present=true) +# CHART_PATH - Path to Helm chart (default: deployments/helm/device-api-server) +# VALUES_FILE - Path to values file (default: deployments/helm/values-sidecar-test.yaml) +# DOCKERFILE - Path to Dockerfile (default: deployments/container/Dockerfile) +# APP_NAME - Helm chart app name for pod selectors (default: device-api-server) +# CONTAINER_NAME - Main container name (default: device-api-server) +# SIDECAR_CONTAINER_NAME - Sidecar container name (default: nvml-provider) +# INTERACTIVE - Enable interactive mode with prompts (default: true) +# SKIP_DESTRUCTIVE - Skip destructive ops in non-interactive mode (default: true) +# SKIP_BUILD - Skip image building entirely (default: false) +# +# Examples: +# # Use default settings with ttl.sh +# ./demos/nvml-sidecar-demo.sh +# +# # Use custom kubeconfig +# KUBECONFIG=~/.kube/config-aws-gpu ./demos/nvml-sidecar-demo.sh +# +# # Use custom registry +# IMAGE_REGISTRY=ghcr.io/nvidia IMAGE_TAG=latest ./demos/nvml-sidecar-demo.sh +# +# # Non-interactive mode (for CI/automation) +# INTERACTIVE=false KUBECONFIG=~/.kube/config ./demos/nvml-sidecar-demo.sh + +set -euo pipefail + +# ============================================================================== +# Configuration (all values configurable via environment variables) +# ============================================================================== + +# Kubernetes configuration +KUBECONFIG="${KUBECONFIG:-${1:-$HOME/.kube/config}}" +NAMESPACE="${NAMESPACE:-device-api}" +RELEASE_NAME="${RELEASE_NAME:-device-api-server}" + +# Paths (relative to repo root) +CHART_PATH="${CHART_PATH:-deployments/helm/device-api-server}" 
+VALUES_FILE="${VALUES_FILE:-deployments/helm/values-sidecar-test.yaml}" +DOCKERFILE="${DOCKERFILE:-deployments/container/Dockerfile}" + +# Image registry settings +IMAGE_REGISTRY="${IMAGE_REGISTRY:-ttl.sh}" +IMAGE_TAG="${IMAGE_TAG:-2h}" + +# Image names (using ttl.sh ephemeral registry by default - images expire based on tag) +SERVER_IMAGE="${SERVER_IMAGE:-${IMAGE_REGISTRY}/device-api-server:${IMAGE_TAG}}" +SIDECAR_IMAGE="${SIDECAR_IMAGE:-${IMAGE_REGISTRY}/device-api-server-sidecar:${IMAGE_TAG}}" + +# Build settings +BUILD_PLATFORM="${BUILD_PLATFORM:-linux/amd64}" + +# Node selection (for listing GPU nodes) +GPU_NODE_SELECTOR="${GPU_NODE_SELECTOR:-nvidia.com/gpu.present=true}" + +# Interactive mode (set to false for CI/automated runs) +INTERACTIVE="${INTERACTIVE:-true}" + +# Skip destructive demos in non-interactive mode +SKIP_DESTRUCTIVE="${SKIP_DESTRUCTIVE:-true}" + +# Skip image building entirely (use pre-built images) +SKIP_BUILD="${SKIP_BUILD:-false}" + +# Helm chart app name (used for pod selectors and container names) +APP_NAME="${APP_NAME:-device-api-server}" +CONTAINER_NAME="${CONTAINER_NAME:-device-api-server}" +SIDECAR_CONTAINER_NAME="${SIDECAR_CONTAINER_NAME:-nvml-provider}" + +# ============================================================================== +# Terminal Colors (buildah-style) +# ============================================================================== + +if [[ -t 1 ]]; then + red=$(tput setaf 1) + green=$(tput setaf 2) + yellow=$(tput setaf 3) + blue=$(tput setaf 4) + magenta=$(tput setaf 5) + cyan=$(tput setaf 6) + white=$(tput setaf 7) + bold=$(tput bold) + reset=$(tput sgr0) +else + red="" + green="" + yellow="" + blue="" + magenta="" + cyan="" + white="" + bold="" + reset="" +fi + +# ============================================================================== +# Helper Functions +# ============================================================================== + +banner() { + echo "" + echo 
"${bold}${blue}============================================================${reset}" + echo "${bold}${blue} $1${reset}" + echo "${bold}${blue}============================================================${reset}" + echo "" +} + +step() { + echo "" + echo "${bold}${green}>>> $1${reset}" + echo "" +} + +info() { + echo "${cyan} $1${reset}" +} + +warn() { + echo "${yellow} WARNING: $1${reset}" +} + +error() { + echo "${red} ERROR: $1${reset}" +} + +run_cmd() { + echo "${magenta} \$ $*${reset}" + "$@" +} + +pause() { + if [[ "${INTERACTIVE}" == "true" ]]; then + echo "" + read -r -p "${yellow}Press ENTER to continue...${reset}" + echo "" + fi +} + +confirm() { + if [[ "${INTERACTIVE}" != "true" ]]; then + # Auto-confirm in non-interactive mode + info "Auto-confirming: $1" + return 0 + fi + echo "" + read -r -p "${yellow}$1 [y/N] ${reset}" response + case "$response" in + [yY][eE][sS]|[yY]) return 0 ;; + *) return 1 ;; + esac +} + +# Confirm for destructive operations (skipped in non-interactive mode if SKIP_DESTRUCTIVE=true) +confirm_destructive() { + if [[ "${INTERACTIVE}" != "true" && "${SKIP_DESTRUCTIVE}" == "true" ]]; then + info "Skipping destructive operation in non-interactive mode: $1" + return 1 + fi + confirm "$1" +} + +check_prereqs() { + local missing=() + + command -v kubectl &>/dev/null || missing+=("kubectl") + command -v helm &>/dev/null || missing+=("helm") + command -v docker &>/dev/null || missing+=("docker") + + if [[ ${#missing[@]} -gt 0 ]]; then + error "Missing prerequisites: ${missing[*]}" + exit 1 + fi + + # Check for buildx (required for cross-platform builds) + if ! 
docker buildx version &>/dev/null; then + warn "docker buildx not available - cross-platform builds may fail" + warn "Run: docker buildx create --use --name multiarch" + else + info "Docker buildx: $(docker buildx version | head -1)" + fi +} + +# ============================================================================== +# Demo Sections +# ============================================================================== + +show_intro() { + [[ "${INTERACTIVE}" == "true" ]] && clear + banner "NVML Provider Sidecar Architecture Demo" + + echo "${white}This demo showcases the sidecar-based NVML provider for device-api-server.${reset}" + echo "" + echo "${white}Architecture:${reset}" + echo "${cyan} ┌─────────────────────────────────────────────────────────┐${reset}" + echo "${cyan} │ Pod │${reset}" + echo "${cyan} │ ┌──────────────────┐ ┌──────────────────┐ │${reset}" + echo "${cyan} │ │ device-api-server│ │ nvml-provider │ │${reset}" + echo "${cyan} │ │ (pure Go) │◄───│ (CGO + NVML) │ │${reset}" + echo "${cyan} │ │ Unix Socket │gRPC│ Health :8082 │ │${reset}" + echo "${cyan} │ │ Health :8081 │ │ RuntimeClass: │ │${reset}" + echo "${cyan} │ │ Metrics :9090 │ │ nvidia │ │${reset}" + echo "${cyan} │ └──────────────────┘ └──────────────────┘ │${reset}" + echo "${cyan} └─────────────────────────────────────────────────────────┘${reset}" + echo "" + echo "${white}Benefits:${reset}" + echo "${green} ✓ Separation of concerns (API server vs NVML access)${reset}" + echo "${green} ✓ Independent scaling and updates${reset}" + echo "${green} ✓ Better testability (mock providers)${reset}" + echo "${green} ✓ Crash isolation (NVML crashes don't kill API server)${reset}" + echo "" + + pause +} + +show_config() { + banner "Configuration" + + echo "${white}Current settings (override via environment variables):${reset}" + echo "" + echo "${cyan} Kubernetes:${reset}" + echo " KUBECONFIG = ${KUBECONFIG}" + echo " NAMESPACE = ${NAMESPACE}" + echo " RELEASE_NAME = ${RELEASE_NAME}" + echo "" 
+ echo "${cyan} Paths:${reset}" + echo " CHART_PATH = ${CHART_PATH}" + echo " VALUES_FILE = ${VALUES_FILE}" + echo " DOCKERFILE = ${DOCKERFILE}" + echo "" + echo "${cyan} Images:${reset}" + echo " IMAGE_REGISTRY = ${IMAGE_REGISTRY}" + echo " IMAGE_TAG = ${IMAGE_TAG}" + echo " SERVER_IMAGE = ${SERVER_IMAGE}" + echo " SIDECAR_IMAGE = ${SIDECAR_IMAGE}" + echo "" + echo "${cyan} Build:${reset}" + echo " BUILD_PLATFORM = ${BUILD_PLATFORM}" + echo "" + echo "${cyan} Cluster:${reset}" + echo " GPU_NODE_SELECTOR = ${GPU_NODE_SELECTOR}" + echo "" + echo "${cyan} Helm Chart:${reset}" + echo " APP_NAME = ${APP_NAME}" + echo " CONTAINER_NAME = ${CONTAINER_NAME}" + echo " SIDECAR_CONTAINER_NAME = ${SIDECAR_CONTAINER_NAME}" + echo "" + echo "${cyan} Mode:${reset}" + echo " INTERACTIVE = ${INTERACTIVE}" + echo " SKIP_DESTRUCTIVE = ${SKIP_DESTRUCTIVE}" + echo " SKIP_BUILD = ${SKIP_BUILD}" + echo "" + + pause +} + +show_cluster_info() { + banner "Step 1: Verify Cluster Connectivity" + + step "Check cluster connection" + run_cmd kubectl --kubeconfig="${KUBECONFIG}" cluster-info + + pause + + step "List GPU nodes (selector: ${GPU_NODE_SELECTOR})" + run_cmd kubectl --kubeconfig="${KUBECONFIG}" get nodes -l "${GPU_NODE_SELECTOR}" -o wide || { + warn "No nodes found with selector '${GPU_NODE_SELECTOR}'" + info "Listing all nodes instead:" + run_cmd kubectl --kubeconfig="${KUBECONFIG}" get nodes -o wide + } + + pause + + step "Verify nvidia RuntimeClass exists" + run_cmd kubectl --kubeconfig="${KUBECONFIG}" get runtimeclass nvidia -o yaml || { + warn "RuntimeClass 'nvidia' not found. GPU access may not work." 
+ } + + pause +} + +check_image_exists() { + local image="$1" + # Try to inspect the manifest - if it exists, the image is available + docker buildx imagetools inspect "${image}" &>/dev/null 2>&1 +} + +build_images() { + banner "Step 2: Build Container Images" + + if [[ "${SKIP_BUILD}" == "true" ]]; then + info "SKIP_BUILD=true, skipping image builds" + info "Using pre-built images:" + info " SERVER_IMAGE: ${SERVER_IMAGE}" + info " SIDECAR_IMAGE: ${SIDECAR_IMAGE}" + return 0 + fi + + info "Building images for registry: ${IMAGE_REGISTRY}" + info "Using unified multi-target Dockerfile at ${DOCKERFILE}" + info "Target platform: ${BUILD_PLATFORM}" + echo "" + + # Ensure buildx is available for cross-platform builds + if ! docker buildx version &>/dev/null; then + error "docker buildx is required for cross-platform builds" + error "Install Docker Desktop or run: docker buildx create --use" + exit 1 + fi + + # Check if images already exist + local need_server=true + local need_sidecar=true + + if check_image_exists "${SERVER_IMAGE}"; then + info "Image ${SERVER_IMAGE} already exists" + if ! confirm "Rebuild device-api-server image?"; then + need_server=false + fi + fi + + if check_image_exists "${SIDECAR_IMAGE}"; then + info "Image ${SIDECAR_IMAGE} already exists" + if ! confirm "Rebuild device-api-server-sidecar image?"; then + need_sidecar=false + fi + fi + + if [[ "${need_server}" == "true" ]]; then + step "Build and push device-api-server image (CGO_ENABLED=0)" + info "This is a pure Go binary with no NVML dependencies" + info "Building for ${BUILD_PLATFORM} and pushing directly..." + run_cmd docker buildx build \ + --platform "${BUILD_PLATFORM}" \ + --target device-api-server \ + -t "${SERVER_IMAGE}" \ + -f "${DOCKERFILE}" \ + --push \ + . 
+ pause + else + info "Skipping device-api-server build" + fi + + if [[ "${need_sidecar}" == "true" ]]; then + step "Build and push device-api-server-sidecar image (CGO_ENABLED=1)" + info "This is the NVML provider sidecar with glibc runtime" + info "Building for ${BUILD_PLATFORM} and pushing directly..." + run_cmd docker buildx build \ + --platform "${BUILD_PLATFORM}" \ + --target nvml-provider \ + -t "${SIDECAR_IMAGE}" \ + -f "${DOCKERFILE}" \ + --push \ + . + pause + else + info "Skipping device-api-server-sidecar build" + fi +} + +show_values_file() { + banner "Step 3: Review Helm Values" + + info "The sidecar architecture is enabled via Helm values" + echo "" + + step "Key configuration in ${VALUES_FILE}:" + echo "" + echo "${cyan}# Disable built-in NVML provider${reset}" + echo "${white}nvml:${reset}" + echo "${white} enabled: false${reset}" + echo "" + echo "${cyan}# Enable NVML Provider sidecar${reset}" + echo "${white}nvmlProvider:${reset}" + echo "${white} enabled: true${reset}" + echo "${white} image:${reset}" + echo "${white} repository: ${IMAGE_REGISTRY}/device-api-server-sidecar${reset}" + echo "${white} tag: \"${IMAGE_TAG}\"${reset}" + echo "${white} # Sidecar connects via shared unix socket volume${reset}" + echo "${white} runtimeClassName: nvidia${reset}" + echo "" + + if [[ -f "${VALUES_FILE}" ]]; then + step "Full values file:" + run_cmd cat "${VALUES_FILE}" + fi + + pause +} + +deploy_sidecar() { + banner "Step 4: Deploy with Sidecar Architecture" + + step "Create namespace if not exists" + echo "${magenta} \$ kubectl create namespace ${NAMESPACE} --dry-run=client -o yaml | kubectl apply -f -${reset}" + kubectl --kubeconfig="${KUBECONFIG}" create namespace "${NAMESPACE}" --dry-run=client -o yaml | \ + kubectl --kubeconfig="${KUBECONFIG}" apply -f - + + pause + + # Check if release already exists + # Build --set overrides to ensure Helm uses the same images we just built, + # regardless of what the values file says. 
+ IMAGE_OVERRIDES=( + --set "image.repository=${IMAGE_REGISTRY}/device-api-server" + --set "image.tag=${IMAGE_TAG}" + --set "nvmlProvider.image.repository=${IMAGE_REGISTRY}/device-api-server-sidecar" + --set "nvmlProvider.image.tag=${IMAGE_TAG}" + ) + + if helm status "${RELEASE_NAME}" --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" &>/dev/null; then + info "Release '${RELEASE_NAME}' already exists" + step "Upgrading existing release..." + run_cmd helm upgrade "${RELEASE_NAME}" "${CHART_PATH}" \ + --kubeconfig="${KUBECONFIG}" \ + --namespace "${NAMESPACE}" \ + -f "${VALUES_FILE}" \ + "${IMAGE_OVERRIDES[@]}" + + step "Restarting pods to pick up changes..." + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" rollout restart daemonset "${RELEASE_NAME}" + else + step "Installing new release..." + run_cmd helm install "${RELEASE_NAME}" "${CHART_PATH}" \ + --kubeconfig="${KUBECONFIG}" \ + --namespace "${NAMESPACE}" \ + -f "${VALUES_FILE}" \ + "${IMAGE_OVERRIDES[@]}" + fi + + pause + + step "Waiting for pods to be ready (timeout 2m)..." + if ! kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" rollout status daemonset "${RELEASE_NAME}" --timeout=2m; then + warn "Rollout not complete within timeout. Checking status..." + fi + + step "Current pod status" + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=${APP_NAME} -o wide + + pause + + step "Verify both containers are running in each pod" + info "Each pod should have 2/2 containers ready" + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=${APP_NAME} \ + -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\t"}{range .status.containerStatuses[*]}{.name}:{.ready}{" "}{end}{"\n"}{end}' + + pause +} + +verify_gpu_registration() { + banner "Step 5: Verify GPU Registration" + + step "Wait for pods to be ready" + info "Waiting up to 60 seconds for pods to start..." + if ! 
kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" wait --for=condition=ready pod -l app.kubernetes.io/name=${APP_NAME} --timeout=60s 2>/dev/null; then + warn "Pods may not be ready yet. Checking status..." + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=${APP_NAME} -o wide + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" describe pods -l app.kubernetes.io/name=${APP_NAME} | tail -30 + error "Pods not ready. Check the output above for issues." + return 1 + fi + + pause + + step "Verify DaemonSet coverage on all GPU nodes" + local gpu_nodes_ready + local gpu_nodes_total + local daemonset_desired + local daemonset_ready + + gpu_nodes_total=$(kubectl --kubeconfig="${KUBECONFIG}" get nodes -l "${GPU_NODE_SELECTOR}" --no-headers 2>/dev/null | wc -l | tr -d ' ') + gpu_nodes_ready=$(kubectl --kubeconfig="${KUBECONFIG}" get nodes -l "${GPU_NODE_SELECTOR}" --no-headers 2>/dev/null | grep -c " Ready" || true) + # Ensure gpu_nodes_ready is a valid number (grep -c returns 0 with exit code 1 when no matches) + [[ -z "${gpu_nodes_ready}" ]] && gpu_nodes_ready=0 + daemonset_desired=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get daemonset "${RELEASE_NAME}" -o jsonpath='{.status.desiredNumberScheduled}' 2>/dev/null || echo "0") + daemonset_ready=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get daemonset "${RELEASE_NAME}" -o jsonpath='{.status.numberReady}' 2>/dev/null || echo "0") + + echo "" + info "GPU Nodes (total): ${gpu_nodes_total}" + info "GPU Nodes (Ready): ${gpu_nodes_ready}" + info "DaemonSet (desired): ${daemonset_desired}" + info "DaemonSet (ready): ${daemonset_ready}" + echo "" + + if [[ "${daemonset_ready}" -eq "${gpu_nodes_ready}" && "${daemonset_ready}" -gt 0 ]]; then + echo "${green} ✓ DaemonSet running on all ${daemonset_ready} Ready GPU nodes${reset}" + else + warn "DaemonSet coverage mismatch! 
Expected ${gpu_nodes_ready} pods, got ${daemonset_ready}" + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get daemonset "${RELEASE_NAME}" + fi + + pause + + step "List all pods and their nodes" + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=${APP_NAME} -o wide + + pause + + step "Get a pod name for testing" + POD=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=${APP_NAME} -o jsonpath='{.items[0].metadata.name}') + if [[ -z "${POD}" ]]; then + error "No pods found. DaemonSet may not be scheduling on any nodes." + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get daemonset + return 1 + fi + NODE=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pod "${POD}" -o jsonpath='{.spec.nodeName}') + info "Using pod: ${POD} (on node: ${NODE})" + + pause + + step "Check device-api-server logs for provider connection" + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" logs "${POD}" -c "${CONTAINER_NAME}" --tail=20 || true + + pause + + step "Check nvml-provider sidecar logs" + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" logs "${POD}" -c "${SIDECAR_CONTAINER_NAME}" --tail=20 || true + + pause + + verify_gpu_uuid_match "${POD}" "${NODE}" +} + +verify_gpu_uuid_match() { + local pod="$1" + local node="$2" + + banner "Step 5b: Verify GPU UUID Match" + + info "Comparing GPU UUIDs from nvidia-smi with device-api-server registered GPUs" + info "Pod: ${pod} | Node: ${node}" + echo "" + + step "Get GPU UUID from nvidia-smi on the node (via sidecar container)" + local nvidia_smi_uuids + nvidia_smi_uuids=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" exec "${pod}" -c "${SIDECAR_CONTAINER_NAME}" -- \ + nvidia-smi --query-gpu=uuid --format=csv,noheader 2>/dev/null || echo "") + + if [[ -z "${nvidia_smi_uuids}" ]]; then + warn "Could not get GPU UUIDs from nvidia-smi" + return 1 + fi + + echo "${cyan} nvidia-smi 
 GPU UUIDs:${reset}"
+    echo "${nvidia_smi_uuids}" | while read -r uuid; do
+        echo " - ${uuid}"
+    done
+    echo ""
+
+    pause
+
+    step "Get registered GPU UUIDs from nvml-provider sidecar logs"
+    local registered_uuids
+    registered_uuids=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" logs "${pod}" -c "${SIDECAR_CONTAINER_NAME}" 2>/dev/null | \
+        grep -o 'uuid="GPU-[^"]*"' | sed 's/uuid="//;s/"$//' | sort -u || echo "")
+
+    if [[ -z "${registered_uuids}" ]]; then
+        warn "Could not find registered GPU UUIDs in logs"
+        return 1
+    fi
+
+    echo "${cyan} Registered GPU UUIDs:${reset}"
+    echo "${registered_uuids}" | while read -r uuid; do
+        echo " - ${uuid}"
+    done
+    echo ""
+
+    pause
+
+    step "Compare UUIDs"
+    local match_count=0
+    local total_count=0
+
+    while read -r smi_uuid; do
+        [[ -z "${smi_uuid}" ]] && continue
+        total_count=$((total_count + 1))
+        if echo "${registered_uuids}" | grep -q "${smi_uuid}"; then
+            echo "${green} ✓ ${smi_uuid} - MATCHED${reset}"
+            match_count=$((match_count + 1))
+        else
+            echo "${red} ✗ ${smi_uuid} - NOT FOUND in registered GPUs${reset}"
+        fi
+    done <<< "${nvidia_smi_uuids}"
+
+    echo ""
+    if [[ "${match_count}" -eq "${total_count}" && "${total_count}" -gt 0 ]]; then
+        echo "${green} ✓ All ${total_count} GPU(s) from nvidia-smi are registered in device-api-server${reset}"
+    else
+        warn "UUID mismatch: ${match_count}/${total_count} GPUs matched"
+    fi
+
+    pause
+}
+
+demonstrate_crash_recovery() {
+    banner "Step 6: Demonstrate Crash Recovery"
+
+    info "The sidecar architecture provides crash isolation."
+    info "If the NVML provider crashes, the API server continues running"
+    info "and will reconnect when the provider restarts."
+ echo "" + + step "Get current pod" + POD=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=${APP_NAME} -o jsonpath='{.items[0].metadata.name}') + info "Using pod: ${POD}" + + pause + + if confirm_destructive "Kill the nvml-provider container to demonstrate crash recovery?"; then + step "Killing nvml-provider container..." + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" exec "${POD}" -c "${SIDECAR_CONTAINER_NAME}" -- kill 1 || true + + info "Waiting for container restart..." + sleep 5 + + step "Check pod status (should show restart count)" + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pod "${POD}" -o wide + + step "Verify API server continued running" + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" logs "${POD}" -c "${CONTAINER_NAME}" --tail=10 || true + + step "Verify provider reconnected" + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" logs "${POD}" -c "${SIDECAR_CONTAINER_NAME}" --tail=10 || true + else + info "Skipping crash recovery demonstration" + fi + + pause +} + +show_metrics() { + banner "Step 7: View Provider Metrics" + + step "Get pod for port-forward" + POD=$(kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=${APP_NAME} -o jsonpath='{.items[0].metadata.name}') + + step "Fetch metrics from the API server" + info "Key metrics to look for:" + info " - device_apiserver_service_status: Whether services are serving" + info " - device_apiserver_build_info: Build information" + info " - grpc_server_*: gRPC request/stream metrics" + echo "" + + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" exec "${POD}" -c "${CONTAINER_NAME}" -- \ + wget -qO- http://localhost:9090/metrics 2>/dev/null | grep -E "^(device_apiserver_|grpc_server_handled_total)" | sort || { + run_cmd kubectl --kubeconfig="${KUBECONFIG}" -n "${NAMESPACE}" exec "${POD}" -c "${CONTAINER_NAME}" -- \ + curl -s 
http://localhost:9090/metrics 2>/dev/null | grep -E "^(device_apiserver_|grpc_server_handled_total)" | sort || true + } + + pause +} + +cleanup() { + banner "Cleanup" + + if confirm_destructive "Remove the sidecar deployment and restore default?"; then + step "Uninstalling Helm release..." + run_cmd helm uninstall "${RELEASE_NAME}" \ + --kubeconfig="${KUBECONFIG}" \ + --namespace "${NAMESPACE}" || true + + info "Cleanup complete!" + else + info "Skipping cleanup. Release '${RELEASE_NAME}' left in namespace '${NAMESPACE}'" + fi +} + +show_summary() { + banner "Demo Complete!" + + echo "${white}What we demonstrated:${reset}" + echo "${green} ✓ Built separate images for device-api-server and device-api-server-sidecar${reset}" + echo "${green} ✓ Deployed as sidecar architecture via Helm${reset}" + echo "${green} ✓ Verified DaemonSet runs on ALL GPU nodes${reset}" + echo "${green} ✓ Verified GPU UUIDs match between nvidia-smi and device-api-server${reset}" + echo "${green} ✓ Showed crash isolation and recovery${reset}" + echo "${green} ✓ Explored provider metrics${reset}" + echo "" + echo "${white}Images built:${reset}" + echo "${cyan} - ${SERVER_IMAGE}${reset}" + echo "${cyan} - ${SIDECAR_IMAGE}${reset}" + echo "" + echo "${white}Key files:${reset}" + echo "${cyan} - ${DOCKERFILE} # Multi-target container build${reset}" + echo "${cyan} - ${VALUES_FILE} # Helm values for sidecar mode${reset}" + echo "${cyan} - ${CHART_PATH}/ # Helm chart with sidecar support${reset}" + echo "" + echo "${white}Environment variables for customization:${reset}" + echo "${cyan} KUBECONFIG, NAMESPACE, RELEASE_NAME, IMAGE_REGISTRY, IMAGE_TAG,${reset}" + echo "${cyan} SERVER_IMAGE, SIDECAR_IMAGE, BUILD_PLATFORM, GPU_NODE_SELECTOR,${reset}" + echo "${cyan} CHART_PATH, VALUES_FILE, DOCKERFILE${reset}" + echo "" +} + +# ============================================================================== +# Main +# ============================================================================== + +main() 
{ + export KUBECONFIG + + show_intro + show_config + check_prereqs + show_cluster_info + + if confirm "Build and push container images?"; then + build_images + else + info "Skipping image build. Using existing images at ${IMAGE_REGISTRY}" + fi + + show_values_file + + if confirm "Deploy the sidecar architecture to the cluster?"; then + deploy_sidecar + verify_gpu_registration + demonstrate_crash_recovery + show_metrics + cleanup + else + info "Skipping deployment" + fi + + show_summary +} + +# Run main if script is executed (not sourced) +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/deployments/container/Dockerfile b/deployments/container/Dockerfile new file mode 100644 index 000000000..d322f3a2f --- /dev/null +++ b/deployments/container/Dockerfile @@ -0,0 +1,190 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Multi-target Dockerfile for NVSentinel components +# +# Targets: +# device-api-server - Pure Go server (no NVML, uses sidecar provider) +# nvml-provider - NVML provider sidecar (CGO, requires RuntimeClass nvidia) +# +# Build examples: +# # Build device-api-server (default, pure Go) +# docker build --target device-api-server -t nvsentinel/device-api-server . +# +# # Build nvml-provider sidecar +# docker build --target nvml-provider -t nvsentinel/nvml-provider . 
+#
+# Note: NVML provider requires glibc runtime (Debian) for RTLD_DEEPBIND support
+
+# TODO: Add Cosign image signing and SBOM generation to CI/CD pipeline.
+# See: https://docs.sigstore.dev/signing/quickstart/
+# Steps:
+# 1. Sign images with cosign: cosign sign --key <key-ref> <image>
+# 2. Generate SBOM: syft <image> -o cyclonedx-json > sbom.json
+# 3. Attach SBOM: cosign attach sbom --sbom sbom.json <image>
+
+# ==============================================================================
+# Build Arguments
+# ==============================================================================
+
+ARG GOLANG_VERSION=1.25
+ARG VERSION=dev
+ARG GIT_COMMIT=unknown
+ARG GIT_TREE_STATE=dirty
+ARG BUILD_DATE
+
+# ==============================================================================
+# Base Builder - Pure Go (Alpine)
+# ==============================================================================
+
+FROM golang:${GOLANG_VERSION}-alpine AS builder-alpine
+
+ARG VERSION
+ARG GIT_COMMIT
+ARG GIT_TREE_STATE
+ARG BUILD_DATE
+
+WORKDIR /workspace
+
+# Install build dependencies
+RUN apk add --no-cache git make
+
+# Copy go mod files first for caching
+COPY go.mod go.sum ./
+
+# Download dependencies
+RUN go mod download
+
+# Copy source code
+COPY . .
+ +# Version package path +ARG VERSION_PKG=github.com/nvidia/nvsentinel/pkg/version + +# Build device-api-server (CGO disabled, pure Go) +RUN CGO_ENABLED=0 GOOS=linux go build \ + -ldflags "-s -w \ + -X ${VERSION_PKG}.Version=${VERSION} \ + -X ${VERSION_PKG}.GitCommit=${GIT_COMMIT} \ + -X ${VERSION_PKG}.GitTreeState=${GIT_TREE_STATE} \ + -X ${VERSION_PKG}.BuildDate=${BUILD_DATE}" \ + -o /build/device-api-server \ + ./cmd/device-api-server + +# ============================================================================== +# Base Builder - CGO (Debian/glibc) +# ============================================================================== + +FROM golang:${GOLANG_VERSION}-bookworm AS builder-debian + +ARG VERSION +ARG GIT_COMMIT +ARG GIT_TREE_STATE +ARG BUILD_DATE + +WORKDIR /workspace + +# Install build dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + make \ + && rm -rf /var/lib/apt/lists/* + +# Copy go mod files first for caching +COPY go.mod go.sum ./ + +# Download dependencies +RUN go mod download + +# Copy source code +COPY . . + +# Version package path +ARG VERSION_PKG=github.com/nvidia/nvsentinel/pkg/version + +# Build nvml-provider (CGO enabled for go-nvml) +RUN CGO_ENABLED=1 go build \ + -tags=nvml \ + -ldflags "-s -w \ + -X ${VERSION_PKG}.Version=${VERSION} \ + -X ${VERSION_PKG}.GitCommit=${GIT_COMMIT} \ + -X ${VERSION_PKG}.GitTreeState=${GIT_TREE_STATE} \ + -X ${VERSION_PKG}.BuildDate=${BUILD_DATE}" \ + -o /build/nvml-provider \ + ./cmd/nvml-provider + +# ============================================================================== +# Target: device-api-server +# ============================================================================== +# Pure Go server with no NVML dependencies. Uses sidecar provider for GPU access. +# Small image size, fast startup, works on any architecture. + +# Pinned to digest for reproducible builds. 
Update with: +# docker manifest inspect alpine:3.21 | jq '.manifests[] | select(.platform.architecture=="amd64") | .digest' +FROM alpine:3.21@sha256:22e0ec13c0db6b3e1ba3280e831fc50ba7bffe58e81f31670a64b1afede247bc AS device-api-server + +LABEL org.opencontainers.image.source="https://github.com/nvidia/nvsentinel" +LABEL org.opencontainers.image.description="NVSentinel Device API Server - Node-local GPU device state cache" +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.title="device-api-server" + +# Add ca-certificates for HTTPS +RUN apk add --no-cache ca-certificates + +WORKDIR / + +COPY --from=builder-alpine --chmod=755 /build/device-api-server /device-api-server + +# Run as non-root user (nobody) +USER 65534:65534 + +# Health probe port (configurable via --health-probe-bind-address) +EXPOSE 8081 +# Metrics port (configurable via --metrics-bind-address) +EXPOSE 9090 + +ENTRYPOINT ["/device-api-server"] + +# ============================================================================== +# Target: nvml-provider +# ============================================================================== +# NVML provider sidecar for GPU enumeration and health monitoring. +# Requires glibc runtime (Debian) for RTLD_DEEPBIND support. +# Must run with RuntimeClass: nvidia to access NVML libraries. + +# Pinned to digest for reproducible builds. 
Update with: +# docker manifest inspect debian:bookworm-slim | jq '.manifests[] | select(.platform.architecture=="amd64") | .digest' +FROM debian:bookworm-slim@sha256:6458e6ce2b6448e31bfdced4be7d8aa88d389e6694ab09f5a718a694abe147f4 AS nvml-provider + +LABEL org.opencontainers.image.source="https://github.com/nvidia/nvsentinel" +LABEL org.opencontainers.image.description="NVSentinel NVML Provider - GPU enumeration and health monitoring sidecar" +LABEL org.opencontainers.image.licenses="Apache-2.0" +LABEL org.opencontainers.image.title="nvml-provider" + +# Add ca-certificates for HTTPS +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR / + +COPY --from=builder-debian --chmod=755 /build/nvml-provider /nvml-provider + +# Run as non-root user +USER 65534:65534 + +# Health check port +EXPOSE 8082 + +ENTRYPOINT ["/nvml-provider"] diff --git a/deployments/helm/device-api-server/Chart.yaml b/deployments/helm/device-api-server/Chart.yaml new file mode 100644 index 000000000..10f76a543 --- /dev/null +++ b/deployments/helm/device-api-server/Chart.yaml @@ -0,0 +1,51 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: device-api-server +description: | + Device API Server - Node-local GPU device state cache server for Kubernetes. 
+ + The Device API Server acts as an intermediary between providers (health monitors) + that update GPU device states and consumers (device plugins, DRA drivers) that + read device states for scheduling decisions. + + Key features: + - Read-blocking semantics during provider updates + - Sidecar architecture for NVML isolation + - Multiple provider and consumer support + - Prometheus metrics and alerting + - Health-based GPU scheduling decisions +type: application +version: 0.1.0 +appVersion: "0.1.0" +kubeVersion: ">=1.25.0-0" +keywords: + - nvidia + - gpu + - device + - nvml + - health + - daemonset + - grpc +home: https://github.com/nvidia/nvsentinel +sources: + - https://github.com/nvidia/nvsentinel +maintainers: + - name: NVIDIA + url: https://github.com/nvidia +icon: https://www.nvidia.com/favicon.ico +annotations: + artifacthub.io/license: Apache-2.0 + artifacthub.io/category: monitoring-logging diff --git a/deployments/helm/device-api-server/README.md b/deployments/helm/device-api-server/README.md new file mode 100644 index 000000000..b8990a413 --- /dev/null +++ b/deployments/helm/device-api-server/README.md @@ -0,0 +1,263 @@ +# Device API Server Helm Chart + +Node-local GPU device state cache server for Kubernetes. + +## Introduction + +The Device API Server is a DaemonSet that runs on each GPU node, providing a local gRPC cache for GPU device states. 
It acts as an intermediary between: + +- **Providers** (health monitors) that update GPU device states +- **Consumers** (device plugins, DRA drivers) that read device states for scheduling decisions + +Key features: + +- Read-blocking semantics during provider updates +- Multiple provider and consumer support +- Optional NVML fallback provider for GPU enumeration and XID monitoring +- Prometheus metrics and alerting +- Unix socket for node-local communication + +## Prerequisites + +- Kubernetes 1.25+ +- Helm 3.0+ +- (Optional) NVIDIA GPU Operator for NVML provider support +- (Optional) Prometheus Operator for ServiceMonitor/PrometheusRule + +## Installation + +### Quick Start + +```bash +# Add the Helm repository (when published) +helm repo add nvsentinel https://nvidia.github.io/nvsentinel +helm repo update + +# Install with default configuration +helm install device-api-server nvsentinel/device-api-server \ + --namespace device-api --create-namespace +``` + +### Install from Local Chart + +```bash +helm install device-api-server ./deployments/helm/device-api-server \ + --namespace device-api --create-namespace +``` + +### Install with NVML Provider + +To enable built-in GPU enumeration and health monitoring via NVML: + +```bash +helm install device-api-server ./deployments/helm/device-api-server \ + --namespace device-api --create-namespace \ + --set nvmlProvider.enabled=true +``` + +> **Note**: NVML provider requires the `nvidia` RuntimeClass. Install the NVIDIA GPU Operator or create it manually. + +### Install with Prometheus Monitoring + +```bash +helm install device-api-server ./deployments/helm/device-api-server \ + --namespace device-api --create-namespace \ + --set metrics.serviceMonitor.enabled=true \ + --set metrics.prometheusRule.enabled=true +``` + +## Configuration + +See [values.yaml](values.yaml) for the full list of configurable parameters. 
+ +### Key Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `image.repository` | Image repository | `ghcr.io/nvidia/device-api-server` | +| `image.tag` | Image tag | Chart appVersion | +| `server.grpcAddress` | gRPC server address | `:50051` | +| `server.unixSocket` | Unix socket path | `/var/run/device-api/device.sock` | +| `server.healthPort` | Health endpoint port | `8081` | +| `server.metricsPort` | Metrics endpoint port | `9090` | +| `nvmlProvider.enabled` | Enable NVML provider sidecar | `false` | +| `nvmlProvider.driverRoot` | NVIDIA driver library root | `/run/nvidia/driver` | +| `nvmlProvider.healthCheckEnabled` | Enable XID event monitoring | `true` | +| `runtimeClassName` | Pod RuntimeClass | `""` | +| `nodeSelector` | Node selector | `nvidia.com/gpu.present: "true"` | +| `metrics.serviceMonitor.enabled` | Create ServiceMonitor | `false` | +| `metrics.prometheusRule.enabled` | Create PrometheusRule | `false` | + +### Resource Configuration + +```yaml +resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi +``` + +### NVML Provider Configuration + +```yaml +nvmlProvider: + enabled: true + driverRoot: /run/nvidia/driver + healthCheckEnabled: true +``` + +Default ignored XIDs (application errors): 13, 31, 43, 45, 68, 109 + +### Node Scheduling + +By default, the DaemonSet schedules only on nodes with `nvidia.com/gpu.present=true` label: + +```yaml +nodeSelector: + nvidia.com/gpu.present: "true" + +tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule +``` + +Override for custom environments: + +```bash +helm install device-api-server ./deployments/helm/device-api-server \ + --set 'nodeSelector.node-type=gpu' \ + --set 'nodeSelector.nvidia\.com/gpu\.present=null' +``` + +## Metrics + +The server exposes Prometheus metrics at `/metrics` on the configured `metricsPort`. 
+ +### Available Metrics + +| Metric | Type | Description | +|--------|------|-------------| +| `device_api_server_info` | Gauge | Server information | +| `device_api_server_cache_gpus_total` | Gauge | Total GPUs in cache | +| `device_api_server_cache_gpus_healthy` | Gauge | Healthy GPUs | +| `device_api_server_cache_gpus_unhealthy` | Gauge | Unhealthy GPUs | +| `device_api_server_cache_updates_total` | Counter | Cache update operations | +| `device_api_server_watch_streams_active` | Gauge | Active watch streams | +| `device_api_server_watch_events_total` | Counter | Watch events sent | +| `device_api_server_nvml_provider_enabled` | Gauge | NVML provider status | +| `device_api_server_nvml_gpu_count` | Gauge | GPUs discovered by NVML | + +### Alerting Rules + +When `metrics.prometheusRule.enabled=true`, the following alerts are configured: + +| Alert | Severity | Description | +|-------|----------|-------------| +| `DeviceAPIServerDown` | Critical | Server unreachable for 5m | +| `DeviceAPIServerHighLatency` | Warning | P99 latency > 500ms | +| `DeviceAPIServerHighErrorRate` | Warning | Error rate > 10% | +| `DeviceAPIServerUnhealthyGPUs` | Warning | Unhealthy GPUs detected | +| `DeviceAPIServerNoGPUs` | Warning | No GPUs registered for 10m | +| `DeviceAPIServerNVMLProviderDown` | Warning | NVML provider not running | + +## Client Connection + +Clients on the same node can connect via: + +### Unix Socket (Recommended) + +```go +conn, err := grpc.Dial( + "unix:///var/run/device-api/device.sock", + grpc.WithInsecure(), +) +``` + +### TCP + +```go +conn, err := grpc.Dial( + "localhost:50051", + grpc.WithInsecure(), +) +``` + +### grpcurl Examples + +```bash +# List available services +grpcurl -plaintext localhost:50051 list + +# List GPUs +grpcurl -plaintext localhost:50051 nvidia.device.v1alpha1.GpuService/ListGpus + +# Watch GPU changes +grpcurl -plaintext localhost:50051 nvidia.device.v1alpha1.GpuService/WatchGpus +``` + +## Upgrading + +```bash +helm upgrade 
device-api-server ./deployments/helm/device-api-server \ + --namespace device-api \ + --reuse-values \ + --set image.tag=v0.2.0 +``` + +## Uninstallation + +```bash +helm uninstall device-api-server --namespace device-api +``` + +## Troubleshooting + +### Pod Not Scheduling + +Check node labels: + +```bash +kubectl get nodes --show-labels | grep gpu +``` + +Ensure nodes have `nvidia.com/gpu.present=true` or override `nodeSelector`. + +### NVML Provider Fails to Start + +1. Verify RuntimeClass exists: + + ```bash + kubectl get runtimeclass nvidia + ``` + +2. Check NVIDIA driver is installed on nodes: + + ```bash + kubectl debug node/ -it --image=nvidia/cuda:12.0-base -- nvidia-smi + ``` + +3. Check pod logs for NVML errors: + + ```bash + kubectl logs -n device-api -l app.kubernetes.io/name=device-api-server + ``` + +### Permission Denied on Unix Socket + +If using custom security contexts, ensure the socket directory is writable: + +```yaml +securityContext: + runAsUser: 0 # May be needed for hostPath access + runAsNonRoot: false +``` + +## License + +Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + +Licensed under the Apache License, Version 2.0. diff --git a/deployments/helm/device-api-server/chart_test.go b/deployments/helm/device-api-server/chart_test.go new file mode 100644 index 000000000..cc5a42864 --- /dev/null +++ b/deployments/helm/device-api-server/chart_test.go @@ -0,0 +1,181 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package chart_test + +import ( + "os" + "os/exec" + "strings" + "testing" +) + +// chartDir returns the path to the Helm chart directory. +func chartDir(t *testing.T) string { + t.Helper() + // When running from the chart directory itself + if _, err := os.Stat("Chart.yaml"); err == nil { + wd, _ := os.Getwd() + return wd + } + t.Fatal("Chart.yaml not found; run tests from the chart directory") + return "" +} + +// helmTemplate runs helm template with optional --set overrides and returns stdout. +func helmTemplate(t *testing.T, sets ...string) string { + t.Helper() + args := []string{"template", "test-release", chartDir(t)} + for _, s := range sets { + args = append(args, "--set", s) + } + cmd := exec.Command("helm", args...) + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("helm template failed: %v\n%s", err, string(out)) + } + return string(out) +} + +func TestChart_DefaultRenders(t *testing.T) { + out := helmTemplate(t) + if len(out) == 0 { + t.Fatal("helm template produced no output") + } + // Should contain a DaemonSet + if !strings.Contains(out, "kind: DaemonSet") { + t.Error("Expected DaemonSet in rendered output") + } + // Should contain a ServiceAccount + if !strings.Contains(out, "kind: ServiceAccount") { + t.Error("Expected ServiceAccount in rendered output") + } +} + +func TestChart_TerminationGracePeriod_Default(t *testing.T) { + out := helmTemplate(t) + // Default: shutdownDelay(5) + shutdownGracePeriod(25) + 5 = 35 + if !strings.Contains(out, "terminationGracePeriodSeconds: 35") { + t.Errorf("Expected terminationGracePeriodSeconds: 35 with defaults, got:\n%s", + extractLine(out, "terminationGracePeriodSeconds")) + } +} + +func TestChart_TerminationGracePeriod_CustomValues(t *testing.T) { + out := helmTemplate(t, + "server.shutdownDelay=10", + "server.shutdownGracePeriod=60", + ) + // 10 + 60 + 5 = 75 + if !strings.Contains(out, 
"terminationGracePeriodSeconds: 75") { + t.Errorf("Expected terminationGracePeriodSeconds: 75 with custom values, got:\n%s", + extractLine(out, "terminationGracePeriodSeconds")) + } +} + +func TestChart_NoNVMLSidecar_ByDefault(t *testing.T) { + out := helmTemplate(t) + if strings.Contains(out, "name: nvml-provider") { + t.Error("NVML provider sidecar should not be present by default") + } +} + +func TestChart_NVMLSidecar_WhenEnabled(t *testing.T) { + out := helmTemplate(t, "nvmlProvider.enabled=true") + if !strings.Contains(out, "name: nvml-provider") { + t.Error("NVML provider sidecar should be present when enabled") + } + // Should have NVIDIA_VISIBLE_DEVICES env var + if !strings.Contains(out, "NVIDIA_VISIBLE_DEVICES") { + t.Error("Expected NVIDIA_VISIBLE_DEVICES env var in nvml-provider sidecar") + } +} + +func TestChart_BindAddress(t *testing.T) { + out := helmTemplate(t) + // Default binds to unix socket + if !strings.Contains(out, "--bind-address=unix:///var/run/device-api/device.sock") { + t.Error("Expected default --bind-address=unix:///var/run/device-api/device.sock") + } +} + +func TestChart_SecurityContext(t *testing.T) { + out := helmTemplate(t) + if !strings.Contains(out, "readOnlyRootFilesystem: true") { + t.Error("Expected readOnlyRootFilesystem: true in security context") + } + if !strings.Contains(out, "runAsNonRoot: true") { + t.Error("Expected runAsNonRoot: true in security context") + } + if !strings.Contains(out, "allowPrivilegeEscalation: false") { + t.Error("Expected allowPrivilegeEscalation: false in security context") + } +} + +func TestChart_SocketVolume(t *testing.T) { + out := helmTemplate(t) + if !strings.Contains(out, "name: socket-dir") { + t.Error("Expected socket-dir volume") + } + if !strings.Contains(out, "/var/run/device-api") { + t.Error("Expected socket directory path /var/run/device-api") + } +} + +func TestChart_MetricsPort_WhenEnabled(t *testing.T) { + out := helmTemplate(t, "metrics.enabled=true") + if 
!strings.Contains(out, "name: metrics") { + t.Error("Expected metrics port when metrics are enabled") + } +} + +func TestChart_MetricsPort_WhenDisabled(t *testing.T) { + out := helmTemplate(t, "metrics.enabled=false") + // The metrics port should not appear in containerPort definitions + lines := strings.Split(out, "\n") + for i, line := range lines { + if strings.Contains(line, "name: metrics") && + i > 0 && strings.Contains(lines[i-1], "containerPort") { + t.Error("Metrics port should not be present when metrics are disabled") + } + } +} + +func TestChart_NodeSelector(t *testing.T) { + out := helmTemplate(t) + if !strings.Contains(out, "nvidia.com/gpu.present") { + t.Error("Expected GPU node selector by default") + } +} + +func TestChart_PreStopHook(t *testing.T) { + out := helmTemplate(t) + // preStop sleep should match shutdownDelay + if !strings.Contains(out, `command: ["sleep", "5"]`) { + // Try alternate format + if !strings.Contains(out, "sleep") { + t.Error("Expected preStop sleep hook") + } + } +} + +// extractLine returns the first line containing the given substring. +func extractLine(s, substr string) string { + for _, line := range strings.Split(s, "\n") { + if strings.Contains(line, substr) { + return strings.TrimSpace(line) + } + } + return "" +} diff --git a/deployments/helm/device-api-server/templates/NOTES.txt b/deployments/helm/device-api-server/templates/NOTES.txt new file mode 100644 index 000000000..bf22b58ef --- /dev/null +++ b/deployments/helm/device-api-server/templates/NOTES.txt @@ -0,0 +1,126 @@ +{{/* +Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} +=============================================================================== + NVIDIA Device API Server has been installed! +=============================================================================== + +Release: {{ .Release.Name }} +Namespace: {{ .Release.Namespace }} +Chart Version: {{ .Chart.Version }} +App Version: {{ .Chart.AppVersion }} + +------------------------------------------------------------------------------- + Configuration Summary +------------------------------------------------------------------------------- + +Unix Socket: {{ .Values.server.unixSocket }} +Health Port: {{ .Values.server.healthPort }} +Metrics Port: {{ .Values.server.metricsPort }} +{{- if .Values.nvmlProvider.enabled }} +NVML Provider Sidecar: Enabled + - Connects via: unix://{{ .Values.server.unixSocket }} + - Driver Root: {{ .Values.nvmlProvider.driverRoot }} + - Health Check: {{ .Values.nvmlProvider.healthCheckEnabled }} +{{- else }} +NVML Provider Sidecar: Disabled +{{- end }} + +------------------------------------------------------------------------------- + Verify Installation +------------------------------------------------------------------------------- + +1. Check that DaemonSet pods are running on GPU nodes: + + kubectl get pods -n {{ .Release.Namespace }} -l app.kubernetes.io/instance={{ .Release.Name }} -o wide + +2. Check pod logs: + + kubectl logs -n {{ .Release.Namespace }} -l app.kubernetes.io/instance={{ .Release.Name }} -f + +3. 
Verify metrics endpoint (from within the cluster):
+
+   kubectl run -n {{ .Release.Namespace }} --rm -it --restart=Never --image=curlimages/curl:latest curl -- \
+     curl -s http://{{ include "device-api-server.fullname" . }}-metrics.{{ .Release.Namespace }}.svc:{{ .Values.server.metricsPort }}/metrics | head -20
+
+{{- if .Values.metrics.enabled }}
+
+-------------------------------------------------------------------------------
+ Metrics & Monitoring
+-------------------------------------------------------------------------------
+
+Metrics endpoint: http://<node-ip>:{{ .Values.server.metricsPort }}/metrics
+
+{{- if .Values.metrics.serviceMonitor.enabled }}
+ServiceMonitor: Enabled (Prometheus will auto-discover)
+{{- else }}
+ServiceMonitor: Disabled
+  To enable Prometheus auto-discovery, upgrade with:
+    --set metrics.serviceMonitor.enabled=true
+{{- end }}
+
+{{- if .Values.metrics.prometheusRule.enabled }}
+PrometheusRule: Enabled (alerts configured)
+{{- else }}
+PrometheusRule: Disabled
+  To enable alerting rules, upgrade with:
+    --set metrics.prometheusRule.enabled=true
+{{- end }}
+{{- end }}
+
+-------------------------------------------------------------------------------
+ Client Connection
+-------------------------------------------------------------------------------
+
+Providers and consumers on the same node can connect via:
+
+  - Unix Socket: unix://{{ .Values.server.unixSocket }}
+
+Example using grpcurl:
+
+  # List available services (via health/admin port)
+  grpcurl -plaintext localhost:{{ .Values.server.healthPort }} list
+
+  # List GPUs (via unix socket, requires grpcurl with unix support)
+  grpcurl -plaintext -unix {{ .Values.server.unixSocket }} \
+    nvidia.device.v1alpha1.GpuService/ListGpus
+
+{{- if .Values.nvmlProvider.enabled }}
+
+-------------------------------------------------------------------------------
+ NVML Provider Sidecar Notes
+-------------------------------------------------------------------------------
+
+The NVML provider sidecar 
requires: + 1. RuntimeClass "nvidia" must exist in the cluster + 2. NVIDIA GPU Operator or Container Toolkit installed + 3. Nodes must have NVIDIA GPUs + +Verify RuntimeClass exists: + kubectl get runtimeclass nvidia + +If not present, create it or install the NVIDIA GPU Operator: + https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/ + +{{- end }} + +------------------------------------------------------------------------------- + Support +------------------------------------------------------------------------------- + +Documentation: https://github.com/nvidia/nvsentinel +Issues: https://github.com/nvidia/nvsentinel/issues + +=============================================================================== diff --git a/deployments/helm/device-api-server/templates/_helpers.tpl b/deployments/helm/device-api-server/templates/_helpers.tpl new file mode 100644 index 000000000..8771b2ec9 --- /dev/null +++ b/deployments/helm/device-api-server/templates/_helpers.tpl @@ -0,0 +1,95 @@ +{{/* +Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + +{{/* +Expand the name of the chart. +*/}} +{{- define "device-api-server.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 
+If release name contains chart name it will be used as a full name. +*/}} +{{- define "device-api-server.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "device-api-server.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "device-api-server.labels" -}} +helm.sh/chart: {{ include "device-api-server.chart" . }} +{{ include "device-api-server.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +app.kubernetes.io/part-of: device-api +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "device-api-server.selectorLabels" -}} +app.kubernetes.io/name: {{ include "device-api-server.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +app.kubernetes.io/component: device-api-server +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "device-api-server.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "device-api-server.fullname" .) 
.Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} + +{{/* +Create the image name +*/}} +{{- define "device-api-server.image" -}} +{{- $tag := default .Chart.AppVersion .Values.image.tag -}} +{{- printf "%s:%s" .Values.image.repository $tag }} +{{- end }} + +{{/* +Socket directory path +*/}} +{{- define "device-api-server.socketDir" -}} +{{- .Values.server.unixSocket | dir }} +{{- end }} diff --git a/deployments/helm/device-api-server/templates/daemonset.yaml b/deployments/helm/device-api-server/templates/daemonset.yaml new file mode 100644 index 000000000..7143ddb5f --- /dev/null +++ b/deployments/helm/device-api-server/templates/daemonset.yaml @@ -0,0 +1,222 @@ +{{/* +Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} +{{- if not (or (hasPrefix "/var/run/" .Values.server.unixSocket) (hasPrefix "/tmp/" .Values.server.unixSocket)) }} +{{- fail "server.unixSocket must be an absolute path under /var/run/ or /tmp/" }} +{{- end }} +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "device-api-server.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "device-api-server.labels" . | nindent 4 }} +spec: + selector: + matchLabels: + {{- include "device-api-server.selectorLabels" . 
| nindent 6 }} + updateStrategy: + {{- toYaml .Values.updateStrategy | nindent 4 }} + template: + metadata: + annotations: + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "device-api-server.labels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "device-api-server.serviceAccountName" . }} + automountServiceAccountToken: {{ .Values.serviceAccount.automountServiceAccountToken }} + {{- with .Values.podSecurityContext }} + securityContext: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} + {{- with .Values.runtimeClassName }} + runtimeClassName: {{ . }} + {{- end }} + initContainers: + # Set restrictive permissions on the socket directory + - name: init-socket-dir + image: {{ include "device-api-server.image" . }} + command: ["sh", "-c", "mkdir -p {{ include "device-api-server.socketDir" . }} && chmod 0750 {{ include "device-api-server.socketDir" . }}"] + securityContext: + runAsUser: 0 + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumeMounts: + - name: socket-dir + mountPath: {{ include "device-api-server.socketDir" . }} + {{- with .Values.initContainers }} + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: {{ .Chart.Name }} + image: {{ include "device-api-server.image" . 
}} + imagePullPolicy: {{ .Values.image.pullPolicy }} + args: + - --bind-address=unix://{{ .Values.server.unixSocket }} + - --health-probe-bind-address=:{{ .Values.server.healthPort }} + - --metrics-bind-address=:{{ .Values.server.metricsPort }} + - --shutdown-grace-period={{ .Values.server.shutdownGracePeriod }}s + - -v={{ .Values.logging.verbosity }} + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + {{- with .Values.env }} + {{- toYaml . | nindent 12 }} + {{- end }} + lifecycle: + preStop: + exec: + # Sleep to allow k8s to propagate endpoint removal + command: ["sleep", "{{ .Values.server.shutdownDelay }}"] + ports: + - name: health + containerPort: {{ .Values.server.healthPort }} + protocol: TCP + {{- if .Values.metrics.enabled }} + - name: metrics + containerPort: {{ .Values.server.metricsPort }} + protocol: TCP + {{- end }} + # Health probes use the TCP admin port (gRPC health service). + # The server's health monitor checks both storage readiness and + # service readiness before reporting SERVING, so a passing probe + # implies the device socket is functional. K8s does not support + # Unix domain socket probes natively. + {{- with .Values.livenessProbe }} + livenessProbe: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.readinessProbe }} + readinessProbe: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.securityContext }} + securityContext: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + volumeMounts: + - name: socket-dir + mountPath: {{ include "device-api-server.socketDir" . }} + {{- with .Values.extraVolumeMounts }} + {{- toYaml . 
| nindent 12 }} + {{- end }} + {{- if .Values.nvmlProvider.enabled }} + # NVML Provider sidecar container + - name: nvml-provider + image: "{{ .Values.nvmlProvider.image.repository }}:{{ .Values.nvmlProvider.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.nvmlProvider.image.pullPolicy }} + args: + - --server-address=unix://{{ .Values.server.unixSocket }} + - --provider-id={{ .Values.nvmlProvider.providerID }} + - --driver-root={{ .Values.nvmlProvider.driverRoot }} + - --health-check={{ .Values.nvmlProvider.healthCheckEnabled }} + - --health-port={{ .Values.nvmlProvider.healthPort }} + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + # NVIDIA Container Toolkit environment variables + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: NVIDIA_DRIVER_CAPABILITIES + value: "utility" + ports: + - name: provider-health + containerPort: {{ .Values.nvmlProvider.healthPort }} + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: provider-health + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /readyz + port: provider-health + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 5 + failureThreshold: 3 + {{- with .Values.nvmlProvider.securityContext }} + securityContext: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.nvmlProvider.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + volumeMounts: + - name: socket-dir + mountPath: {{ include "device-api-server.socketDir" . }} + {{- end }} + {{- with .Values.sidecars }} + {{- toYaml . | nindent 8 }} + {{- end }} + volumes: + - name: socket-dir + hostPath: + path: {{ include "device-api-server.socketDir" . }} + type: DirectoryOrCreate + {{- with .Values.extraVolumes }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . 
| nindent 8 }}
+      {{- end }}
+      {{- with .Values.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      # terminationGracePeriodSeconds = preStop sleep + shutdown grace period + buffer
+      # k8s runs the preStop hook to completion before sending SIGTERM, so the grace period must cover both phases.
+      terminationGracePeriodSeconds: {{ add .Values.server.shutdownDelay .Values.server.shutdownGracePeriod 5 }}
diff --git a/deployments/helm/device-api-server/templates/prometheusrule.yaml b/deployments/helm/device-api-server/templates/prometheusrule.yaml
new file mode 100644
index 000000000..3a82faca6
--- /dev/null
+++ b/deployments/helm/device-api-server/templates/prometheusrule.yaml
@@ -0,0 +1,93 @@
+{{/*
+Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+{{- if and .Values.metrics.enabled .Values.metrics.prometheusRule.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ include "device-api-server.fullname" . }}
+  namespace: {{ .Values.metrics.prometheusRule.namespace | default .Release.Namespace }}
+  labels:
+    {{- include "device-api-server.labels" . | nindent 4 }}
+    {{- with .Values.metrics.prometheusRule.labels }}
+    {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + groups: + - name: device-api-server + rules: + # Server availability + - alert: DeviceAPIServerDown + expr: up{job="{{ include "device-api-server.fullname" . }}-metrics"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Device API Server is down on {{ "{{ $labels.instance }}" }}" + description: "Device API Server has been unreachable for more than 5 minutes." + runbook_url: "https://github.com/nvidia/device-api/blob/main/docs/operations/device-api-server.md#alert-deviceapiserverdown" + + # High latency + - alert: DeviceAPIServerHighLatency + expr: | + histogram_quantile(0.99, + sum(rate(grpc_server_handling_seconds_bucket{ + grpc_service="nvidia.device.v1alpha1.GpuService" + }[5m])) by (le, instance) + ) > 0.5 + for: 5m + labels: + severity: warning + annotations: + summary: "Device API Server high latency on {{ "{{ $labels.instance }}" }}" + description: "P99 latency is above 500ms for more than 5 minutes." + runbook_url: "https://github.com/nvidia/device-api/blob/main/docs/operations/device-api-server.md#alert-deviceapiserverhighlatency" + + # High error rate + - alert: DeviceAPIServerHighErrorRate + expr: | + sum(rate(grpc_server_handled_total{ + grpc_code!="OK", + grpc_service=~"nvidia.device.v1alpha1.*" + }[5m])) by (instance) + / + sum(rate(grpc_server_handled_total{ + grpc_service=~"nvidia.device.v1alpha1.*" + }[5m])) by (instance) + > 0.1 + for: 5m + labels: + severity: warning + annotations: + summary: "Device API Server high error rate on {{ "{{ $labels.instance }}" }}" + description: "Error rate is above 10% for more than 5 minutes." + runbook_url: "https://github.com/nvidia/device-api/blob/main/docs/operations/device-api-server.md#alert-deviceapiserverhigherrorrate" + + # High memory usage + - alert: DeviceAPIServerHighMemory + expr: | + process_resident_memory_bytes{job="{{ include "device-api-server.fullname" . 
}}-metrics"} > 512 * 1024 * 1024 + for: 10m + labels: + severity: warning + annotations: + summary: "Device API Server high memory usage on {{ "{{ $labels.instance }}" }}" + description: "Memory usage is above 512MB for more than 10 minutes." + runbook_url: "https://github.com/nvidia/device-api/blob/main/docs/operations/device-api-server.md#alert-deviceapiserverhighmemory" + + {{- with .Values.metrics.prometheusRule.additionalRules }} + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/deployments/helm/device-api-server/templates/service.yaml b/deployments/helm/device-api-server/templates/service.yaml new file mode 100644 index 000000000..64ee33c40 --- /dev/null +++ b/deployments/helm/device-api-server/templates/service.yaml @@ -0,0 +1,37 @@ +{{/* +Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} +{{- if .Values.metrics.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "device-api-server.fullname" . }}-metrics + namespace: {{ .Release.Namespace }} + labels: + {{- include "device-api-server.labels" . | nindent 4 }} + {{- with .Values.service.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.server.metricsPort }} + targetPort: metrics + protocol: TCP + name: metrics + selector: + {{- include "device-api-server.selectorLabels" . 
| nindent 4 }} +{{- end }} diff --git a/deployments/helm/device-api-server/templates/serviceaccount.yaml b/deployments/helm/device-api-server/templates/serviceaccount.yaml new file mode 100644 index 000000000..e4c0a6091 --- /dev/null +++ b/deployments/helm/device-api-server/templates/serviceaccount.yaml @@ -0,0 +1,29 @@ +{{/* +Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} +{{- if .Values.serviceAccount.create }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "device-api-server.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "device-api-server.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +automountServiceAccountToken: {{ .Values.serviceAccount.automountServiceAccountToken }} +{{- end }} diff --git a/deployments/helm/device-api-server/templates/servicemonitor.yaml b/deployments/helm/device-api-server/templates/servicemonitor.yaml new file mode 100644 index 000000000..cb378ae22 --- /dev/null +++ b/deployments/helm/device-api-server/templates/servicemonitor.yaml @@ -0,0 +1,47 @@ +{{/* +Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} +{{- if and .Values.metrics.enabled .Values.metrics.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "device-api-server.fullname" . }} + namespace: {{ .Values.metrics.serviceMonitor.namespace | default .Release.Namespace }} + labels: + {{- include "device-api-server.labels" . | nindent 4 }} + {{- with .Values.metrics.serviceMonitor.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + {{- include "device-api-server.selectorLabels" . | nindent 6 }} + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + endpoints: + - port: metrics + interval: {{ .Values.metrics.serviceMonitor.interval }} + scrapeTimeout: {{ .Values.metrics.serviceMonitor.scrapeTimeout }} + path: /metrics + {{- with .Values.metrics.serviceMonitor.metricRelabelings }} + metricRelabelings: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.metrics.serviceMonitor.relabelings }} + relabelings: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/deployments/helm/device-api-server/values.yaml b/deployments/helm/device-api-server/values.yaml new file mode 100644 index 000000000..9c9dbb907 --- /dev/null +++ b/deployments/helm/device-api-server/values.yaml @@ -0,0 +1,255 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Default values for device-api-server. +# This is a YAML-formatted file. + +# -- Number of replicas (ignored for DaemonSet, kept for consistency) +replicaCount: 1 + +# -- Image configuration +image: + # -- Image repository + repository: ghcr.io/nvidia/device-api-server + # -- Image pull policy + pullPolicy: IfNotPresent + # -- Image tag (defaults to Chart appVersion) + tag: "" + +# -- Image pull secrets +imagePullSecrets: [] + +# -- Override the name of the chart +nameOverride: "" + +# -- Override the full name of the chart +fullnameOverride: "" + +# -- Server configuration +server: + # -- Unix socket path for gRPC API (device service) + # Must be an absolute path under /var/run/ or /tmp/. + # WARNING: Arbitrary paths may expose host filesystem risks. + unixSocket: /var/run/device-api/device.sock + # -- HTTP port for health/admin gRPC endpoints + healthPort: 8081 + # -- HTTP port for Prometheus metrics + metricsPort: 9090 + # -- Graceful shutdown grace period in seconds + shutdownGracePeriod: 25 + # -- Shutdown delay in seconds (preStop sleep for k8s endpoint propagation) + shutdownDelay: 5 + +# -- Logging configuration +logging: + # -- Log verbosity level (0=info, higher=more verbose) + verbosity: 0 + +# -- NVML Provider Sidecar configuration +# Deploys the NVML provider as a sidecar container that connects to device-api-server +# via gRPC. This provides better isolation and independent updates compared to the +# built-in nvml provider. 
+nvmlProvider:
+  # -- Enable the NVML provider sidecar container
+  enabled: false
+  # -- Image configuration for the nvml-provider sidecar
+  image:
+    # -- Image repository
+    repository: ghcr.io/nvidia/device-api-server
+    # -- Image tag (defaults to Chart appVersion)
+    tag: ""
+    # -- Image pull policy
+    pullPolicy: IfNotPresent
+  # -- gRPC address of the device-api-server (derived from server.unixSocket in daemonset template)
+  # Sidecar connects via shared unix socket volume.
+  # This value is ignored when the sidecar is enabled; the template uses server.unixSocket directly.
+  # -- Unique identifier for this provider instance
+  providerID: "nvml-provider-sidecar"
+  # -- Root path where NVIDIA driver libraries are located
+  driverRoot: /run/nvidia/driver
+  # -- Enable XID event monitoring for health checks
+  healthCheckEnabled: true
+  # -- HTTP port for health check endpoints
+  healthPort: 8082
+  # -- Resource limits and requests for the sidecar
+  resources:
+    requests:
+      cpu: 50m
+      memory: 64Mi
+    limits:
+      cpu: 200m
+      memory: 128Mi
+  # -- Security context for the sidecar container
+  securityContext:
+    runAsNonRoot: true
+    runAsUser: 65534
+    runAsGroup: 65534
+    readOnlyRootFilesystem: true
+    allowPrivilegeEscalation: false
+    capabilities:
+      drop:
+        - ALL
+
+# -- RuntimeClassName for the pod
+# Set to "nvidia" when nvmlProvider.enabled is true to inject NVIDIA driver libraries
+# Requires the NVIDIA GPU Operator or manual RuntimeClass configuration
+runtimeClassName: ""
+
+# -- ServiceAccount configuration
+serviceAccount:
+  # -- Create a ServiceAccount
+  create: true
+  # -- ServiceAccount name (generated if not set)
+  name: ""
+  # -- Annotations to add to the ServiceAccount
+  annotations: {}
+  # -- Automount service account token
+  automountServiceAccountToken: false
+
+# -- RBAC configuration
+rbac:
+  # -- Create RBAC resources
+  create: true
+
+# -- Pod annotations
+podAnnotations: {}
+
+# -- Pod labels
+podLabels: {}
+
+# -- Pod security context
+podSecurityContext:
+  
runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + +# -- Container security context +securityContext: + runAsNonRoot: true + runAsUser: 65534 + runAsGroup: 65534 + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + +# -- Resource limits and requests +# Default limits handle the common 8-GPU case. For larger nodes, increase: +# - 8 GPUs: 500m CPU, 512Mi memory (default) +# - 16 GPUs: 1000m CPU, 1Gi memory +# Memory usage scales with: GPU count * watch event size * watcher count +resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + +# -- Node selector for scheduling +# @default -- Schedules only on GPU nodes +nodeSelector: + nvidia.com/gpu.present: "true" + +# -- Tolerations for scheduling +tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + +# -- Affinity rules +affinity: {} + +# -- Priority class name +priorityClassName: "" + +# -- Liveness probe configuration (gRPC health check on admin server) +livenessProbe: + grpc: + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + +# -- Readiness probe configuration (gRPC health check on admin server) +readinessProbe: + grpc: + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + +# -- Update strategy for the DaemonSet +updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + +# -- Service configuration (for metrics scraping) +service: + # -- Service type + type: ClusterIP + # -- Service annotations + annotations: {} + +# -- Prometheus metrics configuration +metrics: + # -- Enable metrics endpoint + enabled: true + # -- ServiceMonitor configuration (requires Prometheus Operator) + serviceMonitor: + # -- Create ServiceMonitor resource + enabled: false + # -- ServiceMonitor namespace (defaults to release namespace) + namespace: "" + # -- Additional labels for ServiceMonitor + labels: {} + # -- 
Scrape interval + interval: 30s + # -- Scrape timeout + scrapeTimeout: 10s + # -- Metric relabeling configs + metricRelabelings: [] + # -- Relabeling configs + relabelings: [] + # -- PrometheusRule configuration (requires Prometheus Operator) + prometheusRule: + # -- Create PrometheusRule resource + enabled: false + # -- PrometheusRule namespace (defaults to release namespace) + namespace: "" + # -- Additional labels for PrometheusRule + labels: {} + # -- Additional alerting rules + additionalRules: [] + +# -- Additional environment variables +env: [] +# - name: LOG_FORMAT +# value: json + +# -- Additional volume mounts +extraVolumeMounts: [] + +# -- Additional volumes +extraVolumes: [] + +# -- Init containers +initContainers: [] + +# -- Sidecar containers +sidecars: [] diff --git a/deployments/helm/values-sidecar-test.yaml b/deployments/helm/values-sidecar-test.yaml new file mode 100644 index 000000000..970b54d78 --- /dev/null +++ b/deployments/helm/values-sidecar-test.yaml @@ -0,0 +1,54 @@ +# Sidecar test values - validates nvml-provider sidecar architecture +# Usage: helm upgrade device-api-server deployments/helm/device-api-server -n device-api -f deployments/helm/values-sidecar-test.yaml + +image: + repository: ttl.sh/device-api-server + tag: "2h" + pullPolicy: Always + +# Disable built-in NVML provider (use sidecar instead) +nvml: + enabled: false + +# Enable NVML Provider sidecar +nvmlProvider: + enabled: true + image: + repository: ttl.sh/device-api-server-sidecar + tag: "2h" + pullPolicy: Always + providerID: "nvml-provider-sidecar" + driverRoot: /run/nvidia/driver + healthCheckEnabled: true + healthPort: 8082 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi + +# Override node selector (cluster uses node-type=gpu instead of nvidia.com/gpu.present) +# Set to null to remove the default, then add only the one we need +nodeSelector: + nvidia.com/gpu.present: null + node-type: gpu + +# RuntimeClass for NVML access 
+runtimeClassName: nvidia + +logging: + verbosity: 2 + +# Run as root to allow hostPath socket creation +podSecurityContext: + runAsNonRoot: false + runAsUser: 0 + runAsGroup: 0 + fsGroup: 0 + +securityContext: + runAsNonRoot: false + runAsUser: 0 + runAsGroup: 0 diff --git a/deployments/static/nvsentinel-daemonset.yaml b/deployments/static/nvsentinel-daemonset.yaml new file mode 100644 index 000000000..beb6c8d87 --- /dev/null +++ b/deployments/static/nvsentinel-daemonset.yaml @@ -0,0 +1,217 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# WARNING: These static manifests use placeholder image tags (v0.0.0). +# For production deployments, use the Helm chart with explicit image versions +# or replace v0.0.0 with a specific release tag (e.g., v1.0.0). + +# NVSentinel Static Deployment Manifest +# +# This manifest deploys the Device API Server with the NVML Provider sidecar. +# For production use, consider using the Helm chart for better configurability. 
+# +# Usage: +# kubectl apply -f nvsentinel-daemonset.yaml +# +# Prerequisites: +# - Kubernetes 1.25+ +# - RuntimeClass 'nvidia' configured (GPU Operator or manual setup) +# - GPU nodes labeled with 'nvidia.com/gpu.present=true' + +--- +apiVersion: v1 +kind: Namespace +metadata: + name: nvsentinel + labels: + app.kubernetes.io/name: nvsentinel +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: nvsentinel + namespace: nvsentinel + labels: + app.kubernetes.io/name: nvsentinel +automountServiceAccountToken: false +--- +apiVersion: v1 +kind: Service +metadata: + name: nvsentinel + namespace: nvsentinel + labels: + app.kubernetes.io/name: nvsentinel +spec: + type: ClusterIP + clusterIP: None # Headless for DaemonSet + selector: + app.kubernetes.io/name: nvsentinel + ports: + - name: health + port: 8081 + targetPort: health + protocol: TCP + - name: metrics + port: 9090 + targetPort: metrics + protocol: TCP +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvsentinel + namespace: nvsentinel + labels: + app.kubernetes.io/name: nvsentinel + app.kubernetes.io/component: device-api-server +spec: + selector: + matchLabels: + app.kubernetes.io/name: nvsentinel + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + template: + metadata: + labels: + app.kubernetes.io/name: nvsentinel + app.kubernetes.io/component: device-api-server + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9090" + prometheus.io/path: "/metrics" + spec: + serviceAccountName: nvsentinel + # runtimeClassName: nvidia enables the NVIDIA Container Runtime, + # required for the nvml-provider sidecar to access GPU devices. + # This requires RuntimeClass 'nvidia' configured in the cluster + # (via NVIDIA GPU Operator or manual setup). 
+ # See: https://kubernetes.io/docs/concepts/containers/runtime-class/ + runtimeClassName: nvidia + nodeSelector: + nvidia.com/gpu.present: "true" + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + containers: + # Device API Server - Pure Go, no NVML dependencies + - name: device-api-server + image: ghcr.io/nvidia/device-api-server:v0.0.0 # Replace with specific version for production + imagePullPolicy: IfNotPresent + args: + - --bind-address=unix:///var/run/device-api/device.sock + - --health-probe-bind-address=:8081 + - --metrics-bind-address=:9090 + - --shutdown-grace-period=25s + - -v=0 + ports: + - name: health + containerPort: 8081 + protocol: TCP + - name: metrics + containerPort: 9090 + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: health + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /readyz + port: health + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + securityContext: + runAsNonRoot: true + runAsUser: 65534 + runAsGroup: 65534 + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumeMounts: + - name: device-api-socket + mountPath: /var/run/device-api + + # NVML Provider Sidecar - CGO binary, requires RuntimeClass nvidia + - name: nvml-provider + image: ghcr.io/nvidia/device-api-server:nvml-provider-v0.0.0 # Replace with specific version for production + imagePullPolicy: IfNotPresent + args: + - --server-address=unix:///var/run/device-api/device.sock + - --provider-id=nvml-provider + - --driver-root=/run/nvidia/driver + - --health-port=8082 + - --health-check=true + - -v=0 + ports: + - name: provider-health + containerPort: 8082 + protocol: TCP + livenessProbe: + httpGet: + 
path: /healthz
+            port: provider-health
+          initialDelaySeconds: 5
+          periodSeconds: 10
+          timeoutSeconds: 5
+          failureThreshold: 3
+        readinessProbe:
+          httpGet:
+            path: /readyz
+            port: provider-health
+          initialDelaySeconds: 5
+          periodSeconds: 10
+          timeoutSeconds: 5
+          failureThreshold: 3
+        resources:
+          requests:
+            cpu: 50m
+            memory: 64Mi
+          limits:
+            cpu: 200m
+            memory: 128Mi
+        securityContext:
+          runAsNonRoot: true
+          runAsUser: 65534
+          runAsGroup: 65534
+          readOnlyRootFilesystem: true
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - ALL
+        volumeMounts:
+        # Shared socket dir: required so the sidecar can reach the server at
+        # unix:///var/run/device-api/device.sock (see --server-address above).
+        - name: device-api-socket
+          mountPath: /var/run/device-api
+        # Host driver installation, matching --driver-root. NOTE(review):
+        # assumes the NVIDIA driver root is at /run/nvidia/driver on GPU
+        # nodes (GPU Operator default) -- confirm for driverless/manual setups.
+        - name: driver-root
+          mountPath: /run/nvidia/driver
+          readOnly: true
+      volumes:
+      - name: device-api-socket
+        emptyDir: {}
+      - name: driver-root
+        hostPath:
+          path: /run/nvidia/driver
+          type: DirectoryOrCreate
diff --git a/docs/api/device-api-server.md b/docs/api/device-api-server.md
new file mode 100644
index 000000000..22b0c6ee9
--- /dev/null
+++ b/docs/api/device-api-server.md
@@ -0,0 +1,425 @@
+# Device API Server - API Reference
+
+This document provides the complete API reference for the Device API Server gRPC services.
+
+## Overview
+
+The Device API Server exposes a unified `GpuService` that provides both read and write operations following Kubernetes API conventions:
+
+| Operation Type | Methods | Clients |
+|----------------|---------|---------|
+| Read | `GetGpu`, `ListGpus`, `WatchGpus` | Consumers (device plugins, DRA drivers) |
+| Write | `CreateGpu`, `UpdateGpu`, `UpdateGpuStatus`, `DeleteGpu` | Providers (health monitors, NVML) |
+
+**Package**: `nvidia.device.v1alpha1`
+
+**Connection Endpoints**:
+- Unix Socket: `unix:///var/run/device-api/device.sock` (recommended)
+- TCP: `localhost:50051`
+
+## GpuService
+
+The `GpuService` provides a unified API for GPU resource management:
+
+- **Read operations** (`GetGpu`, `ListGpus`, `WatchGpus`) for consumers
+- **Write operations** (`CreateGpu`, `UpdateGpu`, `UpdateGpuStatus`, `DeleteGpu`) for providers
+
+> **Important**: Write operations acquire exclusive locks, blocking all consumer reads until completion. This prevents consumers from reading stale "healthy" states during GPU health transitions.
+ +### Read Operations + +### GetGpu + +Retrieves a single GPU resource by its unique name. + +```protobuf +rpc GetGpu(GetGpuRequest) returns (GetGpuResponse); +``` + +**Request**: + +| Field | Type | Description | +|-------|------|-------------| +| `name` | string | The unique resource name of the GPU | + +**Response**: + +| Field | Type | Description | +|-------|------|-------------| +| `gpu` | Gpu | The requested GPU resource | + +**Errors**: +- `NOT_FOUND`: GPU with the specified name does not exist + +**Example**: + +```bash +grpcurl -plaintext localhost:50051 \ + -d '{"name": "gpu-abc123"}' \ + nvidia.device.v1alpha1.GpuService/GetGpu +``` + +### ListGpus + +Retrieves a list of all GPU resources. + +```protobuf +rpc ListGpus(ListGpusRequest) returns (ListGpusResponse); +``` + +**Request**: Empty (reserved for future filtering/pagination) + +**Response**: + +| Field | Type | Description | +|-------|------|-------------| +| `gpu_list` | GpuList | List of all GPU resources | + +**Example**: + +```bash +grpcurl -plaintext localhost:50051 \ + nvidia.device.v1alpha1.GpuService/ListGpus +``` + +**Response Example**: + +```json +{ + "gpuList": { + "items": [ + { + "name": "gpu-abc123", + "spec": { + "uuid": "GPU-a1b2c3d4-e5f6-a7b8-c9d0-e1f2a3b4c5d6" + }, + "status": { + "conditions": [ + { + "type": "Ready", + "status": "True", + "lastTransitionTime": "2026-01-21T10:00:00Z", + "reason": "GPUHealthy", + "message": "GPU is healthy and available" + } + ] + }, + "resourceVersion": "42" + } + ] + } +} +``` + +### WatchGpus + +Streams lifecycle events for GPU resources. The stream remains open until the client disconnects or an error occurs. 
+ +```protobuf +rpc WatchGpus(WatchGpusRequest) returns (stream WatchGpusResponse); +``` + +**Request**: Empty (reserved for future filtering/resumption) + +**Response Stream**: + +| Field | Type | Description | +|-------|------|-------------| +| `type` | string | Event type: `ADDED`, `MODIFIED`, `DELETED`, `ERROR` | +| `object` | Gpu | The GPU resource (last known state for DELETED) | + +**Event Types**: + +| Type | Description | +|------|-------------| +| `ADDED` | GPU was registered or first observed | +| `MODIFIED` | GPU status was updated | +| `DELETED` | GPU was unregistered | +| `ERROR` | An error occurred in the watch stream | + +**Example**: + +```bash +grpcurl -plaintext localhost:50051 \ + nvidia.device.v1alpha1.GpuService/WatchGpus +``` + +**Behavior**: +- On connection, receives `ADDED` events for all existing GPUs +- Subsequent events reflect real-time changes +- Stream is per-client; multiple clients can watch simultaneously + +### Write Operations + +#### CreateGpu + +Creates a new GPU resource. This is the standard way for providers to register GPUs. 
+ +```protobuf +rpc CreateGpu(CreateGpuRequest) returns (CreateGpuResponse); +``` + +**Request**: + +| Field | Type | Description | +|-------|------|-------------| +| `gpu` | Gpu | The GPU to create (metadata.name and spec.uuid required) | + +**Response**: + +| Field | Type | Description | +|-------|------|-------------| +| `gpu` | Gpu | The created GPU with server-assigned fields | +| `created` | bool | True if new GPU was created, false if already existed | + +**Errors**: +- `INVALID_ARGUMENT`: Required fields missing + +**Behavior**: +- If GPU already exists, returns existing GPU (idempotent) +- Triggers `ADDED` event for active watch streams + +**Example**: + +```bash +grpcurl -plaintext localhost:50051 \ + -d '{ + "gpu": { + "metadata": {"name": "gpu-abc123"}, + "spec": {"uuid": "GPU-a1b2c3d4-e5f6-a7b8-c9d0-e1f2a3b4c5d6"} + } + }' \ + nvidia.device.v1alpha1.GpuService/CreateGpu +``` + +#### UpdateGpu + +Replaces an entire GPU resource (spec and status). + +```protobuf +rpc UpdateGpu(UpdateGpuRequest) returns (Gpu); +``` + +**Request**: + +| Field | Type | Description | +|-------|------|-------------| +| `gpu` | Gpu | The GPU to update (metadata.name required) | + +**Response**: The updated GPU resource. + +**Errors**: +- `NOT_FOUND`: GPU does not exist +- `ABORTED`: Resource version conflict (optimistic concurrency) + +**Behavior**: +- Uses optimistic concurrency via `resource_version` +- Triggers `MODIFIED` event for active watch streams + +#### UpdateGpuStatus + +Updates only the status of an existing GPU (follows Kubernetes subresource pattern). + +```protobuf +rpc UpdateGpuStatus(UpdateGpuStatusRequest) returns (Gpu); +``` + +**Request**: + +| Field | Type | Description | +|-------|------|-------------| +| `name` | string | The GPU name to update | +| `status` | GpuStatus | New status (completely replaces existing) | +| `resource_version` | int64 | Optional: expected version for conflict detection | + +**Response**: The updated GPU resource. 
+ +**Errors**: +- `NOT_FOUND`: GPU does not exist +- `ABORTED`: Resource version conflict (optimistic concurrency) + +**Locking**: Acquires exclusive write lock, blocking all reads. + +**Example** (mark GPU unhealthy due to XID error): + +```bash +grpcurl -plaintext localhost:50051 \ + -d '{ + "name": "gpu-abc123", + "status": { + "conditions": [{ + "type": "Ready", + "status": "False", + "reason": "XidError", + "message": "Critical XID error 79 detected" + }] + } + }' \ + nvidia.device.v1alpha1.GpuService/UpdateGpuStatus +``` + +#### DeleteGpu + +Removes a GPU from the server. + +```protobuf +rpc DeleteGpu(DeleteGpuRequest) returns (google.protobuf.Empty); +``` + +**Request**: + +| Field | Type | Description | +|-------|------|-------------| +| `name` | string | Unique identifier of GPU to remove | + +**Response**: Empty on success. + +**Errors**: +- `NOT_FOUND`: GPU does not exist + +**Behavior**: +- GPU will no longer appear in ListGpus/GetGpu responses +- Triggers `DELETED` event for active watch streams + +**Example**: + +```bash +grpcurl -plaintext localhost:50051 \ + -d '{"name": "gpu-abc123"}' \ + nvidia.device.v1alpha1.GpuService/DeleteGpu +``` + +--- + +## Resource Types + +### Gpu + +The main GPU resource following the Kubernetes Resource Model pattern. + +| Field | Type | Description | +|-------|------|-------------| +| `name` | string | Unique logical identifier | +| `spec` | GpuSpec | Identity and desired attributes | +| `status` | GpuStatus | Most recently observed state | +| `resource_version` | int64 | Monotonically increasing version | + +### GpuSpec + +Defines the identity of a GPU. + +| Field | Type | Description | +|-------|------|-------------| +| `uuid` | string | Physical hardware UUID (e.g., `GPU-a1b2c3d4-...`) | + +### GpuStatus + +Contains the observed state of a GPU. 
+ +| Field | Type | Description | +|-------|------|-------------| +| `conditions` | Condition[] | Current state observations | +| `recommended_action` | string | Suggested resolution for negative states | + +### Condition + +Describes one aspect of the GPU's current state. + +| Field | Type | Description | +|-------|------|-------------| +| `type` | string | Category (e.g., `Ready`, `MemoryHealthy`) | +| `status` | string | `True`, `False`, or `Unknown` | +| `last_transition_time` | Timestamp | When status last changed | +| `reason` | string | Machine-readable reason (UpperCamelCase) | +| `message` | string | Human-readable details | + +**Standard Condition Types**: + +| Type | Description | +|------|-------------| +| `Ready` | Overall GPU health and availability | +| `MemoryHealthy` | GPU memory is functioning correctly | +| `ThermalHealthy` | GPU temperature is within safe limits | + +--- + +## Go Client Example + +```go +package main + +import ( + "context" + "log" + + v1alpha1 "github.com/nvidia/nvsentinel/api/gen/go/device/v1alpha1" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" +) + +func main() { + // Connect via Unix socket (recommended) + conn, err := grpc.NewClient( + "unix:///var/run/device-api/device.sock", + grpc.WithTransportCredentials(insecure.NewCredentials()), + ) + if err != nil { + log.Fatalf("failed to connect: %v", err) + } + defer conn.Close() + + client := v1alpha1.NewGpuServiceClient(conn) + + // Consumer: List GPUs + resp, err := client.ListGpus(context.Background(), &v1alpha1.ListGpusRequest{}) + if err != nil { + log.Fatalf("failed to list GPUs: %v", err) + } + + for _, gpu := range resp.GpuList.Items { + log.Printf("GPU: %s, Version: %d", gpu.Metadata.Name, gpu.Metadata.ResourceVersion) + for _, cond := range gpu.Status.Conditions { + log.Printf(" Condition: %s=%s (%s)", cond.Type, cond.Status, cond.Reason) + } + } + + // Provider: Update GPU status + _, err = client.UpdateGpuStatus(context.Background(), + 
&v1alpha1.UpdateGpuStatusRequest{
+			Name: "gpu-abc123",
+			Status: &v1alpha1.GpuStatus{
+				Conditions: []*v1alpha1.Condition{{
+					Type:    "Ready",
+					Status:  "False",
+					Reason:  "XidError",
+					Message: "Critical XID 79 detected",
+				}},
+			},
+		})
+	if err != nil {
+		log.Fatalf("failed to update status: %v", err)
+	}
+}
+```
+
+---
+
+## Error Codes
+
+| Code | Meaning |
+|------|---------|
+| `NOT_FOUND` | GPU with specified name does not exist |
+| `INVALID_ARGUMENT` | Request contains invalid parameters |
+| `ABORTED` | Resource version conflict (optimistic concurrency) |
+| `INTERNAL` | Server-side error occurred |
+| `UNAVAILABLE` | Server is temporarily unavailable |
+
+---
+
+## See Also
+
+- [Operations Guide](../operations/device-api-server.md)
+- [Design Document](../design/device-api-server.md)
+- [NVML Fallback Provider](../design/nvml-fallback-provider.md)
diff --git a/docs/design/device-api-server.md b/docs/design/device-api-server.md
new file mode 100644
index 000000000..89f159241
--- /dev/null
+++ b/docs/design/device-api-server.md
@@ -0,0 +1,695 @@
+# Device API Server - Design & Implementation Plan
+
+> **Status**: Draft
+> **Author**: NVSentinel Team
+> **Created**: 2026-01-21
+
+## Table of Contents
+
+- [Executive Summary](#executive-summary)
+- [Architecture Overview](#architecture-overview)
+- [Design Decisions](#design-decisions)
+- [Implementation Phases](#implementation-phases)
+- [Directory Structure](#directory-structure)
+- [API Design](#api-design)
+- [Observability](#observability)
+- [Deployment](#deployment)
+
+## Related Documents
+
+- [Implementation Tasks](./device-api-server-tasks.md) - Detailed task breakdown
+- [NVML Fallback Provider](./nvml-fallback-provider.md) - Built-in NVML health provider design
+
+---
+
+## Executive Summary
+
+The Device API Server is a **node-local gRPC cache server** deployed as a Kubernetes DaemonSet.
It acts as an intermediary between: + +- **Providers** (e.g., NVSentinel health monitors) that update GPU device states +- **Consumers** (e.g., Device Plugins, DRA Drivers) that read device states for scheduling decisions + +### Key Requirements + +| Requirement | Description | +|-------------|-------------| +| Node-local | DaemonSet running on each GPU node | +| Read-blocking semantics | MUST block reads during provider updates to prevent stale data | +| Multiple providers | Support multiple health monitors updating different conditions | +| Multiple consumers | Support multiple readers (device-plugin, DRA driver, etc.) | +| Kubernetes patterns | klog/v2, structured logging, health probes | +| Helm-only deployment | No kustomize, pure Helm chart | +| Observability | Prometheus metrics, alerting rules | + +--- + +## Architecture Overview + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Kubernetes Node │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────────────┐ ┌──────────────────────────────┐ │ +│ │ NVSentinel │ │ Device Plugin / DRA │ │ +│ │ (Health Monitor) │ │ Driver │ │ +│ │ [Provider] │ │ [Consumer] │ │ +│ └──────────┬───────────┘ └──────────────┬───────────────┘ │ +│ │ │ │ +│ │ UpdateGpuStatus() │ GetGpu() │ +│ │ (gRPC) │ ListGpus() │ +│ │ │ WatchGpus() │ +│ ▼ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ Device API Server (DaemonSet) │ │ +│ │ ┌────────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ gRPC Server │ │ │ +│ │ │ ┌────────────────────────────────────────────────────────────┐ │ │ │ +│ │ │ │ GpuService (Unified) │ │ │ │ +│ │ │ │ Write: CreateGpu, UpdateGpu, UpdateGpuStatus, DeleteGpu │ │ │ │ +│ │ │ │ Read: GetGpu, ListGpus, WatchGpus │ │ │ │ +│ │ │ └────────────────────────────────┬───────────────────────────┘ │ │ │ +│ │ │ │ │ │ │ +│ │ │ ▼ │ │ │ +│ │ │ 
┌─────────────────────────────────────────────────────────────┐ │ │ │ +│ │ │ │ Cache Layer │ │ │ │ +│ │ │ │ ┌───────────────────────────────────────────────────────┐ │ │ │ │ +│ │ │ │ │ sync.RWMutex (Writer-Preference) │ │ │ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ │ │ │ Write Lock() ──────────► Blocks ALL new RLock() │ │ │ │ │ +│ │ │ │ │ until write completes │ │ │ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ │ │ │ This ensures consumers NEVER read stale data when │ │ │ │ │ +│ │ │ │ │ a provider is updating (healthy → unhealthy) │ │ │ │ │ +│ │ │ │ └───────────────────────────────────────────────────────┘ │ │ │ │ +│ │ │ │ │ │ │ │ +│ │ │ │ ┌───────────────────────────────────────────────────────┐ │ │ │ │ +│ │ │ │ │ map[string]*Gpu (In-Memory Store) │ │ │ │ │ +│ │ │ │ └───────────────────────────────────────────────────────┘ │ │ │ │ +│ │ │ └─────────────────────────────────────────────────────────────┘ │ │ │ +│ │ │ │ │ │ +│ │ │ ┌─────────────────────────────────────────────────────────────┐ │ │ │ +│ │ │ │ Watch Broadcaster │ │ │ │ +│ │ │ │ Notifies all WatchGpus() streams on state changes │ │ │ │ +│ │ │ └─────────────────────────────────────────────────────────────┘ │ │ │ +│ │ └────────────────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────────────┐ │ │ +│ │ │ Health │ │ Metrics │ │ Unix Socket │ │ │ +│ │ │ :8081 │ │ :9090 │ │ /var/run/device-api/device.sock │ │ │ +│ │ │ /healthz │ │ /metrics │ │ (node-local gRPC) │ │ │ +│ │ │ /readyz │ │ │ │ │ │ │ +│ │ └─────────────┘ └─────────────┘ └─────────────────────────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +### Data Flow: Read-Blocking Semantics + +``` +Timeline ──────────────────────────────────────────────────────────────────────────► + +Provider (NVSentinel) Cache (RWMutex) Consumer (Device Plugin) + │ │ │ + │ 
│◄──── RLock() ────────────────┤ GetGpu() + │ │ (allowed) │ + │ │──────────────────────────────►│ Returns data + │ │ RUnlock() │ + │ │ │ + │──── UpdateGpuStatus() ──────►│ │ + │ Lock() requested │ │ + │ │ │ + │ │◄──── RLock() ────────────────┤ GetGpu() + │ │ BLOCKED ⛔ │ (waits) + │ │ │ + │◄──── Lock() acquired ────────│ │ + │ (write in progress) │ │ + │ │ │ + │──── Update complete ────────►│ │ + │ Unlock() │ │ + │ │ │ + │ │──── RLock() allowed ─────────►│ + │ │ (fresh data) │ + │ │ │ + +⚠️ CRITICAL: Consumer NEVER reads stale "healthy" state when provider + is updating to "unhealthy". The RWMutex writer-preference ensures + new readers block once a write is pending. +``` + +--- + +## Design Decisions + +### D1: Read-Blocking vs Eventually Consistent + +| Option | Pros | Cons | Decision | +|--------|------|------|----------| +| **sync.RWMutex (writer-preference)** | Prevents stale reads; simple; Go-native | Readers blocked during writes | ✅ **Selected** | +| atomic.Value + copy-on-write | Never blocks readers | Readers may see stale data during update | ❌ Rejected | +| sync.Map | Good for read-heavy | No blocking semantics; may read stale | ❌ Rejected | + +**Rationale**: The requirement explicitly states "MUST block reads, preventing false positives when a node 'was' healthy, and the next state is unhealthy." This mandates write-blocking reads. + +### D2: Transport Protocol + +| Option | Pros | Cons | Decision | +|--------|------|------|----------| +| **Unix Socket** | Node-local only; no network exposure; fast | Pod must mount socket path | ✅ **Primary** | +| TCP localhost | Easy client setup | Requires port allocation | ✅ **Secondary** | +| hostNetwork + TCP | Accessible from host | Security risk | ❌ Rejected | + +**Rationale**: Unix socket provides security isolation and performance for node-local communication. TCP fallback for flexibility. 
+ +### D3: Provider Registration Model + +| Option | Pros | Cons | Decision | +|--------|------|------|----------| +| **Implicit (any caller can update)** | Simple; stateless server | No provider identity tracking | ✅ **Phase 1** | +| Explicit registration | Track providers; detect failures | More complexity | 🔮 **Phase 2** | + +### D4: Logging Framework + +| Option | Pros | Cons | Decision | +|--------|------|------|----------| +| **klog/v2** | Kubernetes native; contextual logging; JSON format | Slightly verbose API | ✅ **Selected** | +| zap | Fast; popular | Not Kubernetes native | ❌ Rejected | +| logr | Interface-based | Needs backend anyway | Used via klog | + +--- + +## Implementation Phases + +### Phase 1: Core Server Foundation + +**Goal**: Minimal viable gRPC server with cache and blocking semantics. + +| Task ID | Task | Description | Estimate | +|---------|------|-------------|----------| +| P1.1 | Project scaffolding | Create `cmd/device-api-server/`, `internal/` structure | S | +| P1.2 | Proto extensions | Add provider-side RPCs (UpdateGpuStatus, RegisterGpu, UnregisterGpu) | M | +| P1.3 | Cache implementation | Thread-safe cache with RWMutex, writer-preference blocking | M | +| P1.4 | Consumer gRPC service | Implement GetGpu, ListGpus, WatchGpus (read path) | M | +| P1.5 | Provider gRPC service | Implement UpdateGpuStatus, RegisterGpu, UnregisterGpu (write path) | M | +| P1.6 | Watch broadcaster | Fan-out changes to all active WatchGpus streams | M | +| P1.7 | Graceful shutdown | SIGTERM handling, drain connections, health status | S | +| P1.8 | Unit tests | Cache tests, service tests, blocking behavior tests | L | + +**Deliverables**: +- Working gRPC server binary +- Consumer and Provider services +- Basic health endpoint + +--- + +### Phase 2: Kubernetes Integration + +**Goal**: Production-ready DaemonSet with proper k8s integration. 
+ +| Task ID | Task | Description | Estimate | +|---------|------|-------------|----------| +| P2.1 | klog/v2 integration | Structured logging, contextual loggers, log levels | M | +| P2.2 | Health probes | gRPC health protocol, HTTP /healthz /readyz endpoints | M | +| P2.3 | Configuration | Flags, environment variables, config validation | S | +| P2.4 | Unix socket support | Listen on configurable socket path | S | +| P2.5 | Signal handling | Proper SIGTERM/SIGINT handling per k8s lifecycle | S | +| P2.6 | Integration tests | Test with mock providers/consumers | L | + +**Deliverables**: +- Kubernetes-ready binary +- Health endpoints +- Configurable via flags/env + +--- + +### Phase 3: Observability + +**Goal**: Full observability stack with metrics and alerts. + +| Task ID | Task | Description | Estimate | +|---------|------|-------------|----------| +| P3.1 | Prometheus metrics | Request counts, latencies, cache stats, connection counts | M | +| P3.2 | gRPC interceptors | grpc-prometheus interceptors for all RPCs | M | +| P3.3 | Custom metrics | `device_api_server_gpus_total`, `_unhealthy`, `_cache_*` | M | +| P3.4 | Metrics endpoint | HTTP /metrics on separate port | S | +| P3.5 | Alerting rules | PrometheusRule CRD for critical alerts | M | +| P3.6 | Grafana dashboard | JSON dashboard for visualization | M | + +**Metrics to implement**: + +``` +# Server metrics +device_api_server_info{version="...", go_version="..."} +device_api_server_up + +# Cache metrics +device_api_server_cache_gpus_total +device_api_server_cache_gpus_healthy +device_api_server_cache_gpus_unhealthy +device_api_server_cache_updates_total{provider="..."} +device_api_server_cache_lock_wait_seconds_bucket + +# gRPC metrics (via interceptor) +grpc_server_started_total{grpc_service, grpc_method} +grpc_server_handled_total{grpc_service, grpc_method, grpc_code} +grpc_server_handling_seconds_bucket{grpc_service, grpc_method} + +# Watch metrics +device_api_server_watch_streams_active 
+device_api_server_watch_events_total{type="ADDED|MODIFIED|DELETED"} +``` + +**Alerts**: + +```yaml +- alert: DeviceAPIServerDown + expr: up{job="device-api-server"} == 0 + for: 5m + +- alert: DeviceAPIServerHighLatency + expr: histogram_quantile(0.99, grpc_server_handling_seconds_bucket) > 0.5 + for: 5m + +- alert: DeviceAPIServerUnhealthyGPUs + expr: device_api_server_cache_gpus_unhealthy > 0 + for: 1m +``` + +--- + +### Phase 4: Helm Chart + +**Goal**: Production-ready Helm chart with all configurations. + +| Task ID | Task | Description | Estimate | +|---------|------|-------------|----------| +| P4.1 | Chart scaffolding | `charts/device-api-server/` structure | S | +| P4.2 | DaemonSet template | Node selector, tolerations, resource limits | M | +| P4.3 | RBAC templates | ServiceAccount, Role, RoleBinding | M | +| P4.4 | ConfigMap/Secret | Server configuration, TLS certs | M | +| P4.5 | Service templates | Headless service, metrics service | S | +| P4.6 | PrometheusRule | Alerting rules as k8s resource | M | +| P4.7 | ServiceMonitor | Prometheus scrape configuration | S | +| P4.8 | Values schema | JSON schema for values validation | M | +| P4.9 | Chart tests | Helm test hooks | M | +| P4.10 | Documentation | README, NOTES.txt, examples | M | + +**Chart Structure**: + +``` +charts/device-api-server/ +├── Chart.yaml +├── values.yaml +├── values.schema.json +├── README.md +├── templates/ +│ ├── _helpers.tpl +│ ├── daemonset.yaml +│ ├── serviceaccount.yaml +│ ├── role.yaml +│ ├── rolebinding.yaml +│ ├── configmap.yaml +│ ├── service.yaml +│ ├── service-metrics.yaml +│ ├── servicemonitor.yaml +│ ├── prometheusrule.yaml +│ ├── poddisruptionbudget.yaml +│ └── NOTES.txt +└── tests/ + └── test-connection.yaml +``` + +--- + +### Phase 5: Documentation & Polish + +**Goal**: Comprehensive documentation and production hardening. 
+ +| Task ID | Task | Description | Estimate | +|---------|------|-------------|----------| +| P5.1 | Architecture docs | Design document, diagrams | M | +| P5.2 | API reference | Proto documentation, examples | M | +| P5.3 | Operations guide | Deployment, troubleshooting, runbooks | L | +| P5.4 | Developer guide | Contributing, local development | M | +| P5.5 | Security hardening | TLS, authentication review | M | +| P5.6 | Performance testing | Benchmark under load | L | +| P5.7 | CI/CD pipeline | GitHub Actions for build, test, release | M | + +--- + +## Directory Structure + +Following the [kubernetes-sigs/node-feature-discovery](https://github.com/kubernetes-sigs/node-feature-discovery) pattern +where the `api/` is a standalone module and `pkg/` contains public library code: + +``` +NVSentinel/ +├── api/ # STANDALONE API MODULE (own go.mod) +│ ├── gen/go/device/v1alpha1/ # Generated Go code +│ │ ├── gpu.pb.go +│ │ └── gpu_grpc.pb.go +│ ├── proto/device/v1alpha1/ # Proto definitions +│ │ └── gpu.proto # Unified GpuService (CRUD operations) +│ ├── go.mod # module github.com/nvidia/nvsentinel/api +│ ├── go.sum +│ └── Makefile +├── cmd/ # Command entry points (thin) +│ └── device-api-server/ +│ └── main.go # Server entrypoint only +├── pkg/ # PUBLIC LIBRARY CODE (importable) +│ ├── deviceapiserver/ # Device API Server implementation +│ │ ├── cache/ # Thread-safe GPU cache +│ │ │ ├── cache.go +│ │ │ ├── cache_test.go +│ │ │ └── broadcaster.go +│ │ ├── service/ # gRPC service implementation +│ │ │ └── gpu_service.go # GpuService (unified read/write) +│ │ ├── nvml/ # NVML provider (uses gRPC client) +│ │ │ ├── provider.go +│ │ │ ├── enumerator.go +│ │ │ └── health_monitor.go +│ │ ├── metrics/ # Prometheus metrics +│ │ └── health/ # Health check handlers +│ ├── version/ # Version information +│ │ └── version.go +│ └── signals/ # Signal handling utilities +├── charts/ # Helm charts +│ └── device-api-server/ +│ ├── Chart.yaml +│ ├── values.yaml +│ └── templates/ +├── 
docs/ +│ ├── design/ +│ ├── api/ +│ └── operations/ +├── hack/ # Build/development scripts +├── test/ # E2E tests +├── go.mod # Root module with replace directive +├── go.sum +└── Makefile +``` + +**Key Layout Decisions:** + +| Directory | Purpose | Importable | +|-----------|---------|------------| +| `api/` | Standalone API module for versioning | Yes (own module) | +| `pkg/` | Public library code | Yes | +| `cmd/` | Thin entry points | No | +| `charts/` | Helm deployment | N/A | + +Root `go.mod` uses: `replace github.com/nvidia/nvsentinel/api => ./api` + +--- + +## API Design + +### Unified GpuService + +Following Kubernetes API conventions, the API is consolidated into a single `GpuService` with standard CRUD methods: + +```protobuf +// GpuService provides a unified API for managing GPU resources. +// +// Read operations (Get, List, Watch) are intended for consumers. +// Write operations (Create, Update, UpdateStatus, Delete) are intended for providers. +service GpuService { + // Read Operations + rpc GetGpu(GetGpuRequest) returns (Gpu); + rpc ListGpus(ListGpusRequest) returns (ListGpusResponse); + rpc WatchGpus(WatchGpusRequest) returns (stream WatchGpusResponse); + + // Write Operations + rpc CreateGpu(CreateGpuRequest) returns (CreateGpuResponse); + rpc UpdateGpu(UpdateGpuRequest) returns (Gpu); + rpc UpdateGpuStatus(UpdateGpuStatusRequest) returns (Gpu); + rpc DeleteGpu(DeleteGpuRequest) returns (google.protobuf.Empty); +} + +message CreateGpuRequest { + Gpu gpu = 1; // metadata.name and spec.uuid required +} + +message CreateGpuResponse { + Gpu gpu = 1; + bool created = 2; // true if new, false if already existed +} + +message UpdateGpuRequest { + Gpu gpu = 1; // includes resource_version for optimistic concurrency +} + +message UpdateGpuStatusRequest { + string name = 1; + GpuStatus status = 2; + int64 resource_version = 3; // optional, for conflict detection +} + +message DeleteGpuRequest { + string name = 1; +} +``` + +**Design Rationale**: +- Single 
service simplifies API surface and tooling compatibility +- Standard CRUD verbs enable better integration with Kubernetes patterns +- `UpdateGpuStatus` follows the Kubernetes subresource pattern +- Optimistic concurrency via `resource_version` prevents lost updates + +--- + +## Observability + +### Metrics Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Device API Server │ +│ │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ gRPC Interceptors │ │ +│ │ grpc_server_started_total │ │ +│ │ grpc_server_handled_total │ │ +│ │ grpc_server_handling_seconds_bucket │ │ +│ └─────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ Custom Metrics │ │ +│ │ device_api_server_cache_gpus_total │ │ +│ │ device_api_server_cache_lock_contention_total │ │ +│ │ device_api_server_watch_streams_active │ │ +│ └─────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ Go Runtime Metrics │ │ +│ │ go_goroutines │ │ +│ │ go_memstats_alloc_bytes │ │ +│ │ process_cpu_seconds_total │ │ +│ └─────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ :9090/metrics │ +│ │ │ +└──────────────────────────────┼───────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Prometheus │ +│ │ +│ ServiceMonitor ──► scrape_configs │ +│ │ +│ PrometheusRule ──► alerting_rules │ +│ │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Grafana │ +│ │ +│ Dashboard: Device API Server Overview │ +│ - Request rate / error rate │ +│ - P50/P99 latency │ +│ - GPU health summary │ +│ - Cache statistics │ +│ - Active watch streams │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Deployment + 
+### Helm Values (Key Configuration) + +```yaml +# values.yaml +replicaCount: 1 # DaemonSet ignores this, but kept for consistency + +image: + repository: ghcr.io/nvidia/device-api-server + tag: "" # Defaults to Chart appVersion + pullPolicy: IfNotPresent + +# Server configuration +server: + # gRPC listen address (TCP) - localhost only by default for security + # Set to ":50051" to bind to all interfaces (WARNING: unauthenticated API) + grpcAddress: "127.0.0.1:50051" + # Unix socket path (primary for node-local) + unixSocket: /var/run/device-api/device.sock + # Health probe port + healthPort: 8081 + # Metrics port + metricsPort: 9090 + +# Logging +logging: + # Log level (0=info, higher=more verbose) + verbosity: 0 + # Output format: text, json + format: json + +# Node selection +nodeSelector: + nvidia.com/gpu.present: "true" + +tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + +resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + +# Security +securityContext: + runAsNonRoot: true + runAsUser: 65534 + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + +# RBAC +serviceAccount: + create: true + name: "" + automountServiceAccountToken: false + +rbac: + create: true + +# Observability +metrics: + enabled: true + serviceMonitor: + enabled: true + interval: 30s + scrapeTimeout: 10s + prometheusRule: + enabled: true + +# Health probes +probes: + liveness: + initialDelaySeconds: 5 + periodSeconds: 10 + readiness: + initialDelaySeconds: 5 + periodSeconds: 10 +``` + +### DaemonSet Topology + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Kubernetes Cluster │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌───────────────────────┐ ┌───────────────────────┐ ┌───────────────────────┐│ +│ │ GPU Node 1 │ │ GPU Node 2 │ │ GPU Node 3 ││ +│ │ │ │ │ │ ││ +│ │ ┌─────────────────┐ │ │ ┌─────────────────┐ │ │ 
┌─────────────────┐ ││ +│ │ │ device-api- │ │ │ │ device-api- │ │ │ │ device-api- │ ││ +│ │ │ server pod │ │ │ │ server pod │ │ │ │ server pod │ ││ +│ │ │ │ │ │ │ │ │ │ │ │ ││ +│ │ │ GPU-0: Healthy │ │ │ │ GPU-0: Healthy │ │ │ │ GPU-0: Unhealthy│ ││ +│ │ │ GPU-1: Healthy │ │ │ │ GPU-1: Healthy │ │ │ │ GPU-1: Healthy │ ││ +│ │ │ GPU-2: Healthy │ │ │ │ │ │ │ │ GPU-2: Healthy │ ││ +│ │ │ GPU-3: Healthy │ │ │ │ │ │ │ │ GPU-3: Healthy │ ││ +│ │ └─────────────────┘ │ │ └─────────────────┘ │ │ └─────────────────┘ ││ +│ │ │ │ │ │ ││ +│ │ /var/run/device-api/ │ │ /var/run/device-api/ │ │ /var/run/device-api/ ││ +│ │ device.sock │ │ device.sock │ │ device.sock ││ +│ │ │ │ │ │ ││ +│ └───────────────────────┘ └───────────────────────┘ └───────────────────────┘│ +│ │ +│ ┌───────────────────────┐ │ +│ │ Non-GPU Node │ (DaemonSet does NOT schedule here due to │ +│ │ (No GPU) │ nodeSelector: nvidia.com/gpu.present=true) │ +│ └───────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Risk Assessment + +| Risk | Impact | Likelihood | Mitigation | +|------|--------|------------|------------| +| Cache corruption on concurrent writes | High | Low | RWMutex provides exclusivity | +| Watch stream memory leak | Medium | Medium | Bounded channels, timeouts | +| Provider not updating (stale data) | High | Medium | Health checks, provider heartbeat (Phase 2) | +| Socket permission issues | Medium | Medium | Init container for socket dir | +| High lock contention | Medium | Low | Metrics to detect, sharding if needed | + +--- + +## Success Criteria + +### Phase 1 +- [ ] Server starts and accepts gRPC connections +- [ ] Provider can register/update/unregister GPUs +- [ ] Consumer can Get/List/Watch GPUs +- [ ] Read-blocking verified under concurrent load + +### Phase 2 +- [ ] Structured logs with klog/v2 +- [ ] Health probes pass in Kubernetes +- [ ] Unix socket communication works + +### Phase 3 +- [ ] Prometheus 
metrics exposed +- [ ] Grafana dashboard visualizes key metrics +- [ ] Alerts fire correctly in test scenarios + +### Phase 4 +- [ ] `helm install` works out of box +- [ ] DaemonSet schedules on GPU nodes only +- [ ] RBAC properly scoped + +### Phase 5 +- [ ] Documentation complete +- [ ] CI/CD pipeline green +- [ ] Performance benchmarks pass + +--- + +## Appendix: Research References + +1. **Kubernetes DaemonSet gRPC Best Practices** - Health probes, graceful shutdown, load balancing +2. **Go sync.RWMutex** - Writer-preference semantics, blocking behavior +3. **klog/v2** - Structured logging, contextual logging, JSON format +4. **Helm Chart Best Practices** - RBAC, ServiceAccount, DaemonSet templates +5. **grpc-prometheus** - Metrics interceptors, histogram configuration + +--- + +*Document version: 1.0* +*Last updated: 2026-01-21* diff --git a/docs/operations/device-api-server.md b/docs/operations/device-api-server.md new file mode 100644 index 000000000..96df4804a --- /dev/null +++ b/docs/operations/device-api-server.md @@ -0,0 +1,358 @@ +# Device API Server - Operations Guide + +This guide covers deployment, configuration, monitoring, and troubleshooting of the Device API Server. + +## Architecture Overview + +The Device API Server is a pure Go gRPC server with no hardware dependencies. +GPU enumeration and health monitoring is provided by external providers (sidecars). 
+ +``` +┌─────────────────────────────────────────────────────────────┐ +│ GPU Node │ +│ ┌─────────────────────────────────────────────────────────┐│ +│ │ Device API Server (DaemonSet) ││ +│ │ ┌─────────────────────────────────────────────────┐ ││ +│ │ │ GpuService (unified) │ ││ +│ │ │ Read: GetGpu, ListGpus, WatchGpus │ ││ +│ │ │ Write: CreateGpu, UpdateGpuStatus, DeleteGpu │ ││ +│ │ └────────────────────┬────────────────────────────┘ ││ +│ │ │ ││ +│ │ ▼ ││ +│ │ ┌─────────────────────────────────────────────────────┐││ +│ │ │ GPU Cache (RWMutex) │││ +│ │ │ - Read-blocking during writes │││ +│ │ │ - Watch event broadcasting │││ +│ │ └─────────────────────────────────────────────────────┘││ +│ └─────────────────────────────────────────────────────────┘│ +│ │ +│ Providers (gRPC clients): │ +│ - nvml-provider sidecar (GPU enumeration, XID monitoring) │ +│ - Custom providers (CreateGpu, UpdateGpuStatus) │ +│ │ +│ Consumers (gRPC clients): │ +│ - Device plugins (GetGpu, ListGpus, WatchGpus) │ +│ - DRA drivers (GetGpu, ListGpus, WatchGpus) │ +└─────────────────────────────────────────────────────────────┘ +``` + +## Deployment + +### Prerequisites + +- Kubernetes 1.25+ +- Helm 3.0+ +- GPU nodes with label `nvidia.com/gpu.present=true` +- (Optional) Prometheus Operator for monitoring + +### Installation + +**Basic Installation**: + +```bash +helm install device-api-server ./deployments/helm/device-api-server \ + --namespace device-api --create-namespace +``` + +**With Prometheus Monitoring**: + +```bash +helm install device-api-server ./deployments/helm/device-api-server \ + --namespace device-api --create-namespace \ + --set metrics.serviceMonitor.enabled=true \ + --set metrics.prometheusRule.enabled=true +``` + +### Verify Installation + +```bash +# Check DaemonSet status +kubectl get daemonset -n device-api + +# Check pods are running on GPU nodes +kubectl get pods -n device-api -o wide + +# Check logs +kubectl logs -n device-api -l 
app.kubernetes.io/name=device-api-server +``` + +--- + +## Configuration + +### Command-Line Flags + +| Flag | Default | Description | +|------|---------|-------------| +| `--bind-address` | `unix:///var/run/nvidia-device-api/device-api.sock` | Unix socket URI for the gRPC device API | +| `--health-probe-bind-address` | `:50051` | TCP address for gRPC health and reflection | +| `--metrics-bind-address` | `:9090` | TCP address for HTTP Prometheus metrics | +| `--shutdown-grace-period` | `25s` | Maximum time to wait for graceful shutdown | +| `--hostname-override` | (auto-detected) | Override the node hostname (must be a valid DNS subdomain) | +| `-v` | `0` | Log verbosity level (klog) | + +### Helm Values + +See [values.yaml](../../deployments/helm/device-api-server/values.yaml) for the complete reference. + +Key configuration sections: + +```yaml +# Server configuration +server: + unixSocket: /var/run/device-api/device.sock + healthPort: 8081 + metricsPort: 9090 + shutdownGracePeriod: 25 + shutdownDelay: 5 + +# Node scheduling +nodeSelector: + nvidia.com/gpu.present: "true" + +# Resources +resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi +``` + +--- + +## GPU Providers + +The Device API Server is a pure Go gRPC server with no hardware dependencies. +GPU enumeration and health monitoring is provided by external providers that connect +as gRPC clients: + +- **nvml-provider sidecar** - Recommended NVML-based provider for GPU enumeration and XID monitoring +- **Custom providers** - Any gRPC client can register GPUs via `CreateGpu` and update health via `UpdateGpuStatus` + +See the [nvml-provider demo](../../demos/nvml-sidecar-demo.sh) for an example sidecar deployment. 
+ +--- + +## Monitoring + +### Health Endpoints + +| Endpoint | Port | Description | +|----------|------|-------------| +| `/healthz` | 8081 | Liveness probe - server is running | +| `/readyz` | 8081 | Readiness probe - server is accepting traffic | +| `/metrics` | 9090 | Prometheus metrics | + +### Prometheus Metrics + +**Server Metrics**: + +| Metric | Type | Description | +|--------|------|-------------| +| `device_api_server_info` | Gauge | Server information (version, go_version) | + +**Cache Metrics**: + +| Metric | Type | Description | +|--------|------|-------------| +| `device_api_server_cache_gpus_total` | Gauge | Total GPUs in cache | +| `device_api_server_cache_gpus_healthy` | Gauge | Healthy GPUs | +| `device_api_server_cache_gpus_unhealthy` | Gauge | Unhealthy GPUs | +| `device_api_server_cache_gpus_unknown` | Gauge | GPUs with unknown status | +| `device_api_server_cache_updates_total` | Counter | Cache update operations | +| `device_api_server_cache_resource_version` | Gauge | Current cache version | + +**Watch Metrics**: + +| Metric | Type | Description | +|--------|------|-------------| +| `device_api_server_watch_streams_active` | Gauge | Active watch streams | +| `device_api_server_watch_events_total` | Counter | Watch events sent | + +### Alerting Rules + +When `metrics.prometheusRule.enabled=true`, the following alerts are created: + +| Alert | Severity | Condition | +|-------|----------|-----------| +| `DeviceAPIServerDown` | Critical | Server unreachable for 5m | +| `DeviceAPIServerHighLatency` | Warning | P99 latency > 500ms | +| `DeviceAPIServerHighErrorRate` | Warning | Error rate > 10% | +| `DeviceAPIServerUnhealthyGPUs` | Warning | Unhealthy GPUs > 0 | +| `DeviceAPIServerNoGPUs` | Warning | No GPUs for 10m | +| `DeviceAPIServerHighMemory` | Warning | Memory > 512MB | + +### Grafana Dashboard + +Example PromQL queries for dashboards: + +```promql +# GPU health overview +device_api_server_cache_gpus_healthy / 
device_api_server_cache_gpus_total * 100 + +# Watch stream activity +rate(device_api_server_watch_events_total[5m]) + +# Cache update rate +rate(device_api_server_cache_updates_total[5m]) +``` + +--- + +## Troubleshooting + +### Pod Not Scheduling + +**Symptom**: DaemonSet shows 0/N pods ready + +**Check**: + +```bash +# Verify node labels +kubectl get nodes --show-labels | grep gpu + +# Check DaemonSet events +kubectl describe daemonset -n device-api device-api-server +``` + +**Solution**: Ensure nodes have `nvidia.com/gpu.present=true` label or override `nodeSelector`. + +### Permission Denied on Unix Socket + +**Symptom**: Clients cannot connect to Unix socket + +**Check**: + +```bash +# Check socket permissions on node +ls -la /var/run/device-api/ +``` + +**Solution**: Verify `securityContext` allows socket creation, or adjust `runAsUser`. + +### GPUs Not Appearing + +**Symptom**: `ListGpus` returns empty + +**Check**: + +```bash +# Check for GPU enumeration errors +kubectl logs -n device-api | grep -i error + +# Check if provider sidecar is running +kubectl get pods -n device-api -o wide +``` + +**Solutions**: +1. Deploy the nvml-provider sidecar: see [nvml-provider demo](../../demos/nvml-sidecar-demo.sh) +2. Deploy an external health provider +3. Verify the provider can connect to the Device API Server + +### High Memory Usage + +**Symptom**: Pod OOMKilled or memory alerts firing + +**Check**: + +```bash +# Check current memory usage +kubectl top pods -n device-api + +# Check watch stream count +curl -s http://:9090/metrics | grep watch_streams +``` + +**Solutions**: +1. Increase memory limits +2. Investigate clients creating excessive watch streams +3. 
Check for memory leaks in logs + +### Watch Stream Disconnections + +**Symptom**: Consumers report frequent reconnections + +**Check**: + +```bash +# Check network policy +kubectl get networkpolicy -n device-api + +# Check for errors in logs +kubectl logs -n device-api | grep -i "stream\|watch" +``` + +**Solutions**: +1. Ensure network policies allow intra-node traffic +2. Check client timeout settings +3. Verify server is not overloaded + +--- + +## Graceful Shutdown + +The server implements graceful shutdown: + +1. **PreStop Hook**: Sleeps for `shutdownDelay` seconds +2. **Signal Handling**: Catches SIGTERM/SIGINT +3. **Drain Period**: Stops accepting new connections +4. **In-Flight Completion**: Waits for active requests (up to `shutdownTimeout`) +5. **Resource Cleanup**: Closes connections + +**Timeline**: + +``` +SIGTERM → [shutdownDelay] → Stop listeners → [shutdownGracePeriod] → Force close +``` + +Configure in Helm: + +```yaml +server: + shutdownGracePeriod: 25 # Max wait for in-flight requests (seconds) + shutdownDelay: 5 # Pre-shutdown delay for endpoint propagation (seconds) +``` + +--- + +## Security Considerations + +### Pod Security + +Default security context (non-root, restricted): + +```yaml +securityContext: + runAsNonRoot: true + runAsUser: 65534 + runAsGroup: 65534 + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL +``` + +### Network Security + +> **Warning**: The gRPC API is unauthenticated. + +- The gRPC device API binds to a **Unix domain socket** by default (`--bind-address=unix:///var/run/nvidia-device-api/device-api.sock`). This limits access to processes on the same node. +- The health probe endpoint (`--health-probe-bind-address`) binds to a TCP port for kubelet probes but only serves gRPC health and reflection, not the device API. +- In multi-tenant or partially untrusted clusters, use a Kubernetes `NetworkPolicy` to restrict access to the health and metrics TCP ports. 
+ +### Service Account + +- `automountServiceAccountToken: false` by default +- No Kubernetes API access required + +--- + +## See Also + +- [API Reference](../api/device-api-server.md) +- [Design Document](../design/device-api-server.md) +- [Helm Chart README](../../deployments/helm/device-api-server/README.md) +- [NVML Sidecar Demo](../../demos/nvml-sidecar-demo.sh) diff --git a/examples/fake-client/main_test.go b/examples/fake-client/main_test.go index c552f566a..bc80953fe 100644 --- a/examples/fake-client/main_test.go +++ b/examples/fake-client/main_test.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ package main_test import ( "context" + "sync" "testing" "time" @@ -31,6 +32,82 @@ import ( "k8s.io/client-go/tools/cache" ) +// bookmarkWatch wraps a watch.Interface to inject a bookmark event after +// creation. This is needed because k8s.io/client-go v0.35+ requires bookmark +// events for the reflector to consider initial sync complete, but the fake +// client's ObjectTracker doesn't send them automatically. +type bookmarkWatch struct { + watch.Interface + bookmarkCh chan watch.Event + resultCh chan watch.Event + stopCh chan struct{} + stopOnce sync.Once +} + +func newBookmarkWatch(w watch.Interface) *bookmarkWatch { + bw := &bookmarkWatch{ + Interface: w, + bookmarkCh: make(chan watch.Event, 1), + resultCh: make(chan watch.Event), + stopCh: make(chan struct{}), + } + + // Send initial bookmark to signal list completion. + // The bookmark object must be the same type as the expected resource (GPU). 
+ bw.bookmarkCh <- watch.Event{ + Type: watch.Bookmark, + Object: &devicev1alpha1.GPU{ + ObjectMeta: metav1.ObjectMeta{ + ResourceVersion: "0", + Annotations: map[string]string{ + metav1.InitialEventsAnnotationKey: "true", + }, + }, + }, + } + + // Multiplex bookmark and underlying watch events + go func() { + defer close(bw.resultCh) + for { + select { + case <-bw.stopCh: + return + case ev, ok := <-bw.bookmarkCh: + if ok { + select { + case bw.resultCh <- ev: + case <-bw.stopCh: + return + } + } + case ev, ok := <-w.ResultChan(): + if !ok { + return + } + select { + case bw.resultCh <- ev: + case <-bw.stopCh: + return + } + } + } + }() + + return bw +} + +func (bw *bookmarkWatch) ResultChan() <-chan watch.Event { + return bw.resultCh +} + +func (bw *bookmarkWatch) Stop() { + bw.stopOnce.Do(func() { + close(bw.stopCh) + }) + bw.Interface.Stop() +} + func TestGPUInformerWithFakeClient(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -47,6 +124,10 @@ func TestGPUInformerWithFakeClient(t *testing.T) { // signal the test when the informer has successfully established its // stream, preventing race conditions where events are injected before // the watcher is ready. + // + // The reactor also wraps the watch to inject a bookmark event, which is + // required by k8s.io/client-go v0.35+ for the reflector to consider the + // initial sync complete. client.PrependWatchReactor("*", func(action clienttesting.Action) (handled bool, ret watch.Interface, err error) { watchAction, ok := action.(clienttesting.WatchActionImpl) if !ok { @@ -58,15 +139,18 @@ func TestGPUInformerWithFakeClient(t *testing.T) { ns := action.GetNamespace() // Manually invoke the tracker to create the watch stream. 
- watch, err := client.Tracker().Watch(gvr, ns, opts) + w, err := client.Tracker().Watch(gvr, ns, opts) if err != nil { return false, nil, err } + // Wrap watch to inject initial bookmark event for reflector sync + wrappedWatch := newBookmarkWatch(w) + // Close the channel to notify the test that the Informer is now // listening for events. close(watcherStarted) - return true, watch, nil + return true, wrappedWatch, nil }) // Create a factory for the informers. diff --git a/go.mod b/go.mod index d1f0ae9d1..23a936b27 100644 --- a/go.mod +++ b/go.mod @@ -1,20 +1,20 @@ module github.com/nvidia/nvsentinel -go 1.25.5 +go 1.25.0 require ( + github.com/NVIDIA/go-nvml v0.12.9-0 github.com/go-logr/logr v1.4.3 github.com/go-logr/stdr v1.2.2 github.com/google/go-cmp v0.7.0 github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1 github.com/k3s-io/kine v1.14.2 github.com/prometheus/client_golang v1.23.2 - github.com/spf13/cobra v1.10.2 - github.com/spf13/pflag v1.0.9 + github.com/spf13/pflag v1.0.10 go.uber.org/goleak v1.3.0 golang.org/x/sync v0.18.0 - google.golang.org/grpc v1.78.0 - google.golang.org/protobuf v1.36.11 + google.golang.org/grpc v1.77.0 + google.golang.org/protobuf v1.36.10 k8s.io/apimachinery v0.35.0 k8s.io/apiserver v0.35.0 k8s.io/client-go v0.35.0 @@ -26,7 +26,6 @@ require ( require ( cel.dev/expr v0.24.0 // indirect filippo.io/edwards25519 v1.1.0 // indirect - github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect github.com/NYTimes/gziphandler v1.1.1 // indirect github.com/Rican7/retry v0.3.1 // indirect github.com/antlr4-go/antlr/v4 v4.13.0 // indirect @@ -75,7 +74,6 @@ require ( github.com/mailru/easyjson v0.7.7 // indirect github.com/mattn/go-sqlite3 v1.14.32 // indirect github.com/minio/highwayhash v1.0.3 // indirect - github.com/moby/term v0.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect 
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect @@ -93,6 +91,7 @@ require ( github.com/shengdoushi/base58 v1.0.0 // indirect github.com/sirupsen/logrus v1.9.3 // indirect github.com/soheilhy/cmux v0.1.5 // indirect + github.com/spf13/cobra v1.10.0 // indirect github.com/stoewer/go-strcase v1.3.0 // indirect github.com/tidwall/btree v1.8.1 // indirect github.com/tmc/grpc-websocket-proxy v0.0.0-20220101234140-673ab2c3ae75 // indirect @@ -128,7 +127,7 @@ require ( golang.org/x/text v0.31.0 // indirect golang.org/x/time v0.12.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20251029180050-ab9386a59fda // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect diff --git a/go.sum b/go.sum index 7c99213db..cb1e0c3fa 100644 --- a/go.sum +++ b/go.sum @@ -2,10 +2,10 @@ cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY= cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/NVIDIA/go-nvml v0.12.9-0 h1:e344UK8ZkeMeeLkdQtRhmXRxNf+u532LDZPGMtkdus0= +github.com/NVIDIA/go-nvml v0.12.9-0/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= 
github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I= github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c= github.com/Rican7/retry v0.3.1 h1:scY4IbO8swckzoA/11HgBwaZRJEyY9vaNJshcdhp1Mc= @@ -32,8 +32,6 @@ github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8 github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= -github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= -github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -151,8 +149,6 @@ github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuE github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/minio/highwayhash v1.0.3 h1:kbnuUMoHYyVl7szWjSxJnxw11k2U709jqFPPmIUyD6Q= github.com/minio/highwayhash v1.0.3/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ= -github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= -github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -201,10 +197,11 @@ github.com/sirupsen/logrus 
v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/soheilhy/cmux v0.1.5 h1:jjzc5WVemNEDTLwv9tlmemhC73tI08BNOIGwBOo10Js= github.com/soheilhy/cmux v0.1.5/go.mod h1:T7TcVDs9LWfQgPlPsdngu6I6QIoyIFZDDC6sNE1GqG0= -github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU= -github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4= -github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= -github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/cobra v1.10.0 h1:a5/WeUlSDCvV5a45ljW2ZFtV0bTDpkfSAj3uqB6Sc+0= +github.com/spf13/cobra v1.10.0/go.mod h1:9dhySC7dnTtEiqzmqfkLj47BslqLCUPMXjG2lj/NgoE= +github.com/spf13/pflag v1.0.8/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= +github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs= github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -309,7 +306,6 @@ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.21.0/go.mod 
h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= @@ -338,14 +334,14 @@ gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= -google.golang.org/genproto/googleapis/api v0.0.0-20251029180050-ab9386a59fda h1:+2XxjfsAu6vqFxwGBRcHiMaDCuZiqXGDUDVWVtrFAnE= -google.golang.org/genproto/googleapis/api v0.0.0-20251029180050-ab9386a59fda/go.mod h1:fDMmzKV90WSg1NbozdqrE64fkuTv6mlq2zxo9ad+3yo= +google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8 h1:mepRgnBZa07I4TRuomDE4sTIYieg/osKmzIf4USdWS4= +google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8/go.mod h1:fDMmzKV90WSg1NbozdqrE64fkuTv6mlq2zxo9ad+3yo= google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 h1:Wgl1rcDNThT+Zn47YyCXOXyX/COgMTIdhJ717F0l4xk= google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= -google.golang.org/grpc v1.78.0 h1:K1XZG/yGDJnzMdd/uZHAkVqJE+xIDOcmdSFZkBUicNc= -google.golang.org/grpc v1.78.0/go.mod h1:I47qjTo4OKbMkjA/aOOwxDIiPSBofUtQUI5EfpWvW7U= -google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= -google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM= +google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig= +google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= +google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod 
h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= diff --git a/hack/boilerplate.go.txt b/hack/boilerplate.go.txt index e1732e8d5..6307eef7b 100644 --- a/hack/boilerplate.go.txt +++ b/hack/boilerplate.go.txt @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/internal/generated/device/v1alpha1/gpu.pb.go b/internal/generated/device/v1alpha1/gpu.pb.go index 17419e268..d184d3eb9 100644 --- a/internal/generated/device/v1alpha1/gpu.pb.go +++ b/internal/generated/device/v1alpha1/gpu.pb.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.36.10 -// protoc v6.33.0 +// protoc v6.33.4 // source: device/v1alpha1/gpu.proto package v1alpha1 @@ -1173,6 +1173,64 @@ func (x *UpdateGpuRequest) GetOpts() *UpdateOptions { return nil } +// UpdateGpuStatusRequest specifies the GPU whose status should be updated. +// Only metadata (name, namespace, resource_version) and status fields are used. +type UpdateGpuStatusRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + // gpu is the GPU resource with updated status. + // The server reads metadata.name, metadata.namespace, metadata.resource_version + // and status from this object. All other fields are ignored. 
+ Gpu *Gpu `protobuf:"bytes,1,opt,name=gpu,proto3" json:"gpu,omitempty"` + // opts contains the options for the update. + Opts *UpdateOptions `protobuf:"bytes,2,opt,name=opts,proto3" json:"opts,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *UpdateGpuStatusRequest) Reset() { + *x = UpdateGpuStatusRequest{} + mi := &file_device_v1alpha1_gpu_proto_msgTypes[20] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *UpdateGpuStatusRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*UpdateGpuStatusRequest) ProtoMessage() {} + +func (x *UpdateGpuStatusRequest) ProtoReflect() protoreflect.Message { + mi := &file_device_v1alpha1_gpu_proto_msgTypes[20] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use UpdateGpuStatusRequest.ProtoReflect.Descriptor instead. +func (*UpdateGpuStatusRequest) Descriptor() ([]byte, []int) { + return file_device_v1alpha1_gpu_proto_rawDescGZIP(), []int{20} +} + +func (x *UpdateGpuStatusRequest) GetGpu() *Gpu { + if x != nil { + return x.Gpu + } + return nil +} + +func (x *UpdateGpuStatusRequest) GetOpts() *UpdateOptions { + if x != nil { + return x.Opts + } + return nil +} + type DeleteGpuRequest struct { state protoimpl.MessageState `protogen:"open.v1"` // The unique resource name of the GPU to delete. 
@@ -1190,7 +1248,7 @@ type DeleteGpuRequest struct { func (x *DeleteGpuRequest) Reset() { *x = DeleteGpuRequest{} - mi := &file_device_v1alpha1_gpu_proto_msgTypes[20] + mi := &file_device_v1alpha1_gpu_proto_msgTypes[21] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1202,7 +1260,7 @@ func (x *DeleteGpuRequest) String() string { func (*DeleteGpuRequest) ProtoMessage() {} func (x *DeleteGpuRequest) ProtoReflect() protoreflect.Message { - mi := &file_device_v1alpha1_gpu_proto_msgTypes[20] + mi := &file_device_v1alpha1_gpu_proto_msgTypes[21] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1215,7 +1273,7 @@ func (x *DeleteGpuRequest) ProtoReflect() protoreflect.Message { // Deprecated: Use DeleteGpuRequest.ProtoReflect.Descriptor instead. func (*DeleteGpuRequest) Descriptor() ([]byte, []int) { - return file_device_v1alpha1_gpu_proto_rawDescGZIP(), []int{20} + return file_device_v1alpha1_gpu_proto_rawDescGZIP(), []int{21} } func (x *DeleteGpuRequest) GetName() string { @@ -1306,18 +1364,22 @@ const file_device_v1alpha1_gpu_proto_rawDesc = "" + "\x04opts\x18\x02 \x01(\v2).nvidia.nvsentinel.v1alpha1.CreateOptionsR\x04opts\"\x84\x01\n" + "\x10UpdateGpuRequest\x121\n" + "\x03gpu\x18\x01 \x01(\v2\x1f.nvidia.nvsentinel.v1alpha1.GpuR\x03gpu\x12=\n" + + "\x04opts\x18\x02 \x01(\v2).nvidia.nvsentinel.v1alpha1.UpdateOptionsR\x04opts\"\x8a\x01\n" + + "\x16UpdateGpuStatusRequest\x121\n" + + "\x03gpu\x18\x01 \x01(\v2\x1f.nvidia.nvsentinel.v1alpha1.GpuR\x03gpu\x12=\n" + "\x04opts\x18\x02 \x01(\v2).nvidia.nvsentinel.v1alpha1.UpdateOptionsR\x04opts\"\x83\x01\n" + "\x10DeleteGpuRequest\x12\x12\n" + "\x04name\x18\x01 \x01(\tR\x04name\x12\x1c\n" + "\tnamespace\x18\x02 \x01(\tR\tnamespace\x12=\n" + - "\x04opts\x18\x03 \x01(\v2).nvidia.nvsentinel.v1alpha1.DeleteOptionsR\x04opts2\xcb\x04\n" + + "\x04opts\x18\x03 \x01(\v2).nvidia.nvsentinel.v1alpha1.DeleteOptionsR\x04opts2\xb3\x05\n" + "\n" + 
"GpuService\x12_\n" + "\x06GetGpu\x12).nvidia.nvsentinel.v1alpha1.GetGpuRequest\x1a*.nvidia.nvsentinel.v1alpha1.GetGpuResponse\x12e\n" + "\bListGpus\x12+.nvidia.nvsentinel.v1alpha1.ListGpusRequest\x1a,.nvidia.nvsentinel.v1alpha1.ListGpusResponse\x12j\n" + "\tWatchGpus\x12,.nvidia.nvsentinel.v1alpha1.WatchGpusRequest\x1a-.nvidia.nvsentinel.v1alpha1.WatchGpusResponse0\x01\x12Z\n" + "\tCreateGpu\x12,.nvidia.nvsentinel.v1alpha1.CreateGpuRequest\x1a\x1f.nvidia.nvsentinel.v1alpha1.Gpu\x12Z\n" + - "\tUpdateGpu\x12,.nvidia.nvsentinel.v1alpha1.UpdateGpuRequest\x1a\x1f.nvidia.nvsentinel.v1alpha1.Gpu\x12Q\n" + + "\tUpdateGpu\x12,.nvidia.nvsentinel.v1alpha1.UpdateGpuRequest\x1a\x1f.nvidia.nvsentinel.v1alpha1.Gpu\x12f\n" + + "\x0fUpdateGpuStatus\x122.nvidia.nvsentinel.v1alpha1.UpdateGpuStatusRequest\x1a\x1f.nvidia.nvsentinel.v1alpha1.Gpu\x12Q\n" + "\tDeleteGpu\x12,.nvidia.nvsentinel.v1alpha1.DeleteGpuRequest\x1a\x16.google.protobuf.EmptyBJZHgithub.com/nvidia/nvsentinel/internal/generated/device/v1alpha1;v1alpha1b\x06proto3" var ( @@ -1332,41 +1394,42 @@ func file_device_v1alpha1_gpu_proto_rawDescGZIP() []byte { return file_device_v1alpha1_gpu_proto_rawDescData } -var file_device_v1alpha1_gpu_proto_msgTypes = make([]protoimpl.MessageInfo, 21) +var file_device_v1alpha1_gpu_proto_msgTypes = make([]protoimpl.MessageInfo, 22) var file_device_v1alpha1_gpu_proto_goTypes = []any{ - (*ObjectMeta)(nil), // 0: nvidia.nvsentinel.v1alpha1.ObjectMeta - (*ListMeta)(nil), // 1: nvidia.nvsentinel.v1alpha1.ListMeta - (*GetOptions)(nil), // 2: nvidia.nvsentinel.v1alpha1.GetOptions - (*ListOptions)(nil), // 3: nvidia.nvsentinel.v1alpha1.ListOptions - (*CreateOptions)(nil), // 4: nvidia.nvsentinel.v1alpha1.CreateOptions - (*UpdateOptions)(nil), // 5: nvidia.nvsentinel.v1alpha1.UpdateOptions - (*DeleteOptions)(nil), // 6: nvidia.nvsentinel.v1alpha1.DeleteOptions - (*Gpu)(nil), // 7: nvidia.nvsentinel.v1alpha1.Gpu - (*GpuList)(nil), // 8: nvidia.nvsentinel.v1alpha1.GpuList - (*GpuSpec)(nil), // 9: 
nvidia.nvsentinel.v1alpha1.GpuSpec - (*GpuStatus)(nil), // 10: nvidia.nvsentinel.v1alpha1.GpuStatus - (*Condition)(nil), // 11: nvidia.nvsentinel.v1alpha1.Condition - (*GetGpuRequest)(nil), // 12: nvidia.nvsentinel.v1alpha1.GetGpuRequest - (*GetGpuResponse)(nil), // 13: nvidia.nvsentinel.v1alpha1.GetGpuResponse - (*ListGpusRequest)(nil), // 14: nvidia.nvsentinel.v1alpha1.ListGpusRequest - (*ListGpusResponse)(nil), // 15: nvidia.nvsentinel.v1alpha1.ListGpusResponse - (*WatchGpusRequest)(nil), // 16: nvidia.nvsentinel.v1alpha1.WatchGpusRequest - (*WatchGpusResponse)(nil), // 17: nvidia.nvsentinel.v1alpha1.WatchGpusResponse - (*CreateGpuRequest)(nil), // 18: nvidia.nvsentinel.v1alpha1.CreateGpuRequest - (*UpdateGpuRequest)(nil), // 19: nvidia.nvsentinel.v1alpha1.UpdateGpuRequest - (*DeleteGpuRequest)(nil), // 20: nvidia.nvsentinel.v1alpha1.DeleteGpuRequest - (*timestamppb.Timestamp)(nil), // 21: google.protobuf.Timestamp - (*emptypb.Empty)(nil), // 22: google.protobuf.Empty + (*ObjectMeta)(nil), // 0: nvidia.nvsentinel.v1alpha1.ObjectMeta + (*ListMeta)(nil), // 1: nvidia.nvsentinel.v1alpha1.ListMeta + (*GetOptions)(nil), // 2: nvidia.nvsentinel.v1alpha1.GetOptions + (*ListOptions)(nil), // 3: nvidia.nvsentinel.v1alpha1.ListOptions + (*CreateOptions)(nil), // 4: nvidia.nvsentinel.v1alpha1.CreateOptions + (*UpdateOptions)(nil), // 5: nvidia.nvsentinel.v1alpha1.UpdateOptions + (*DeleteOptions)(nil), // 6: nvidia.nvsentinel.v1alpha1.DeleteOptions + (*Gpu)(nil), // 7: nvidia.nvsentinel.v1alpha1.Gpu + (*GpuList)(nil), // 8: nvidia.nvsentinel.v1alpha1.GpuList + (*GpuSpec)(nil), // 9: nvidia.nvsentinel.v1alpha1.GpuSpec + (*GpuStatus)(nil), // 10: nvidia.nvsentinel.v1alpha1.GpuStatus + (*Condition)(nil), // 11: nvidia.nvsentinel.v1alpha1.Condition + (*GetGpuRequest)(nil), // 12: nvidia.nvsentinel.v1alpha1.GetGpuRequest + (*GetGpuResponse)(nil), // 13: nvidia.nvsentinel.v1alpha1.GetGpuResponse + (*ListGpusRequest)(nil), // 14: nvidia.nvsentinel.v1alpha1.ListGpusRequest + 
(*ListGpusResponse)(nil), // 15: nvidia.nvsentinel.v1alpha1.ListGpusResponse + (*WatchGpusRequest)(nil), // 16: nvidia.nvsentinel.v1alpha1.WatchGpusRequest + (*WatchGpusResponse)(nil), // 17: nvidia.nvsentinel.v1alpha1.WatchGpusResponse + (*CreateGpuRequest)(nil), // 18: nvidia.nvsentinel.v1alpha1.CreateGpuRequest + (*UpdateGpuRequest)(nil), // 19: nvidia.nvsentinel.v1alpha1.UpdateGpuRequest + (*UpdateGpuStatusRequest)(nil), // 20: nvidia.nvsentinel.v1alpha1.UpdateGpuStatusRequest + (*DeleteGpuRequest)(nil), // 21: nvidia.nvsentinel.v1alpha1.DeleteGpuRequest + (*timestamppb.Timestamp)(nil), // 22: google.protobuf.Timestamp + (*emptypb.Empty)(nil), // 23: google.protobuf.Empty } var file_device_v1alpha1_gpu_proto_depIdxs = []int32{ - 21, // 0: nvidia.nvsentinel.v1alpha1.ObjectMeta.creation_timestamp:type_name -> google.protobuf.Timestamp + 22, // 0: nvidia.nvsentinel.v1alpha1.ObjectMeta.creation_timestamp:type_name -> google.protobuf.Timestamp 0, // 1: nvidia.nvsentinel.v1alpha1.Gpu.metadata:type_name -> nvidia.nvsentinel.v1alpha1.ObjectMeta 9, // 2: nvidia.nvsentinel.v1alpha1.Gpu.spec:type_name -> nvidia.nvsentinel.v1alpha1.GpuSpec 10, // 3: nvidia.nvsentinel.v1alpha1.Gpu.status:type_name -> nvidia.nvsentinel.v1alpha1.GpuStatus 1, // 4: nvidia.nvsentinel.v1alpha1.GpuList.metadata:type_name -> nvidia.nvsentinel.v1alpha1.ListMeta 7, // 5: nvidia.nvsentinel.v1alpha1.GpuList.items:type_name -> nvidia.nvsentinel.v1alpha1.Gpu 11, // 6: nvidia.nvsentinel.v1alpha1.GpuStatus.conditions:type_name -> nvidia.nvsentinel.v1alpha1.Condition - 21, // 7: nvidia.nvsentinel.v1alpha1.Condition.last_transition_time:type_name -> google.protobuf.Timestamp + 22, // 7: nvidia.nvsentinel.v1alpha1.Condition.last_transition_time:type_name -> google.protobuf.Timestamp 2, // 8: nvidia.nvsentinel.v1alpha1.GetGpuRequest.opts:type_name -> nvidia.nvsentinel.v1alpha1.GetOptions 7, // 9: nvidia.nvsentinel.v1alpha1.GetGpuResponse.gpu:type_name -> nvidia.nvsentinel.v1alpha1.Gpu 3, // 10: 
nvidia.nvsentinel.v1alpha1.ListGpusRequest.opts:type_name -> nvidia.nvsentinel.v1alpha1.ListOptions @@ -1377,24 +1440,28 @@ var file_device_v1alpha1_gpu_proto_depIdxs = []int32{ 4, // 15: nvidia.nvsentinel.v1alpha1.CreateGpuRequest.opts:type_name -> nvidia.nvsentinel.v1alpha1.CreateOptions 7, // 16: nvidia.nvsentinel.v1alpha1.UpdateGpuRequest.gpu:type_name -> nvidia.nvsentinel.v1alpha1.Gpu 5, // 17: nvidia.nvsentinel.v1alpha1.UpdateGpuRequest.opts:type_name -> nvidia.nvsentinel.v1alpha1.UpdateOptions - 6, // 18: nvidia.nvsentinel.v1alpha1.DeleteGpuRequest.opts:type_name -> nvidia.nvsentinel.v1alpha1.DeleteOptions - 12, // 19: nvidia.nvsentinel.v1alpha1.GpuService.GetGpu:input_type -> nvidia.nvsentinel.v1alpha1.GetGpuRequest - 14, // 20: nvidia.nvsentinel.v1alpha1.GpuService.ListGpus:input_type -> nvidia.nvsentinel.v1alpha1.ListGpusRequest - 16, // 21: nvidia.nvsentinel.v1alpha1.GpuService.WatchGpus:input_type -> nvidia.nvsentinel.v1alpha1.WatchGpusRequest - 18, // 22: nvidia.nvsentinel.v1alpha1.GpuService.CreateGpu:input_type -> nvidia.nvsentinel.v1alpha1.CreateGpuRequest - 19, // 23: nvidia.nvsentinel.v1alpha1.GpuService.UpdateGpu:input_type -> nvidia.nvsentinel.v1alpha1.UpdateGpuRequest - 20, // 24: nvidia.nvsentinel.v1alpha1.GpuService.DeleteGpu:input_type -> nvidia.nvsentinel.v1alpha1.DeleteGpuRequest - 13, // 25: nvidia.nvsentinel.v1alpha1.GpuService.GetGpu:output_type -> nvidia.nvsentinel.v1alpha1.GetGpuResponse - 15, // 26: nvidia.nvsentinel.v1alpha1.GpuService.ListGpus:output_type -> nvidia.nvsentinel.v1alpha1.ListGpusResponse - 17, // 27: nvidia.nvsentinel.v1alpha1.GpuService.WatchGpus:output_type -> nvidia.nvsentinel.v1alpha1.WatchGpusResponse - 7, // 28: nvidia.nvsentinel.v1alpha1.GpuService.CreateGpu:output_type -> nvidia.nvsentinel.v1alpha1.Gpu - 7, // 29: nvidia.nvsentinel.v1alpha1.GpuService.UpdateGpu:output_type -> nvidia.nvsentinel.v1alpha1.Gpu - 22, // 30: nvidia.nvsentinel.v1alpha1.GpuService.DeleteGpu:output_type -> google.protobuf.Empty - 25, 
// [25:31] is the sub-list for method output_type - 19, // [19:25] is the sub-list for method input_type - 19, // [19:19] is the sub-list for extension type_name - 19, // [19:19] is the sub-list for extension extendee - 0, // [0:19] is the sub-list for field type_name + 7, // 18: nvidia.nvsentinel.v1alpha1.UpdateGpuStatusRequest.gpu:type_name -> nvidia.nvsentinel.v1alpha1.Gpu + 5, // 19: nvidia.nvsentinel.v1alpha1.UpdateGpuStatusRequest.opts:type_name -> nvidia.nvsentinel.v1alpha1.UpdateOptions + 6, // 20: nvidia.nvsentinel.v1alpha1.DeleteGpuRequest.opts:type_name -> nvidia.nvsentinel.v1alpha1.DeleteOptions + 12, // 21: nvidia.nvsentinel.v1alpha1.GpuService.GetGpu:input_type -> nvidia.nvsentinel.v1alpha1.GetGpuRequest + 14, // 22: nvidia.nvsentinel.v1alpha1.GpuService.ListGpus:input_type -> nvidia.nvsentinel.v1alpha1.ListGpusRequest + 16, // 23: nvidia.nvsentinel.v1alpha1.GpuService.WatchGpus:input_type -> nvidia.nvsentinel.v1alpha1.WatchGpusRequest + 18, // 24: nvidia.nvsentinel.v1alpha1.GpuService.CreateGpu:input_type -> nvidia.nvsentinel.v1alpha1.CreateGpuRequest + 19, // 25: nvidia.nvsentinel.v1alpha1.GpuService.UpdateGpu:input_type -> nvidia.nvsentinel.v1alpha1.UpdateGpuRequest + 20, // 26: nvidia.nvsentinel.v1alpha1.GpuService.UpdateGpuStatus:input_type -> nvidia.nvsentinel.v1alpha1.UpdateGpuStatusRequest + 21, // 27: nvidia.nvsentinel.v1alpha1.GpuService.DeleteGpu:input_type -> nvidia.nvsentinel.v1alpha1.DeleteGpuRequest + 13, // 28: nvidia.nvsentinel.v1alpha1.GpuService.GetGpu:output_type -> nvidia.nvsentinel.v1alpha1.GetGpuResponse + 15, // 29: nvidia.nvsentinel.v1alpha1.GpuService.ListGpus:output_type -> nvidia.nvsentinel.v1alpha1.ListGpusResponse + 17, // 30: nvidia.nvsentinel.v1alpha1.GpuService.WatchGpus:output_type -> nvidia.nvsentinel.v1alpha1.WatchGpusResponse + 7, // 31: nvidia.nvsentinel.v1alpha1.GpuService.CreateGpu:output_type -> nvidia.nvsentinel.v1alpha1.Gpu + 7, // 32: nvidia.nvsentinel.v1alpha1.GpuService.UpdateGpu:output_type -> 
nvidia.nvsentinel.v1alpha1.Gpu + 7, // 33: nvidia.nvsentinel.v1alpha1.GpuService.UpdateGpuStatus:output_type -> nvidia.nvsentinel.v1alpha1.Gpu + 23, // 34: nvidia.nvsentinel.v1alpha1.GpuService.DeleteGpu:output_type -> google.protobuf.Empty + 28, // [28:35] is the sub-list for method output_type + 21, // [21:28] is the sub-list for method input_type + 21, // [21:21] is the sub-list for extension type_name + 21, // [21:21] is the sub-list for extension extendee + 0, // [0:21] is the sub-list for field type_name } func init() { file_device_v1alpha1_gpu_proto_init() } @@ -1408,7 +1475,7 @@ func file_device_v1alpha1_gpu_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_device_v1alpha1_gpu_proto_rawDesc), len(file_device_v1alpha1_gpu_proto_rawDesc)), NumEnums: 0, - NumMessages: 21, + NumMessages: 22, NumExtensions: 0, NumServices: 1, }, diff --git a/internal/generated/device/v1alpha1/gpu_grpc.pb.go b/internal/generated/device/v1alpha1/gpu_grpc.pb.go index c31f32a56..2590d7ca7 100644 --- a/internal/generated/device/v1alpha1/gpu_grpc.pb.go +++ b/internal/generated/device/v1alpha1/gpu_grpc.pb.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. 
// versions: // - protoc-gen-go-grpc v1.5.1 -// - protoc v6.33.0 +// - protoc v6.33.4 // source: device/v1alpha1/gpu.proto package v1alpha1 @@ -34,12 +34,13 @@ import ( const _ = grpc.SupportPackageIsVersion9 const ( - GpuService_GetGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/GetGpu" - GpuService_ListGpus_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/ListGpus" - GpuService_WatchGpus_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/WatchGpus" - GpuService_CreateGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/CreateGpu" - GpuService_UpdateGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/UpdateGpu" - GpuService_DeleteGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/DeleteGpu" + GpuService_GetGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/GetGpu" + GpuService_ListGpus_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/ListGpus" + GpuService_WatchGpus_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/WatchGpus" + GpuService_CreateGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/CreateGpu" + GpuService_UpdateGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/UpdateGpu" + GpuService_UpdateGpuStatus_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/UpdateGpuStatus" + GpuService_DeleteGpu_FullMethodName = "/nvidia.nvsentinel.v1alpha1.GpuService/DeleteGpu" ) // GpuServiceClient is the client API for GpuService service. @@ -58,6 +59,8 @@ type GpuServiceClient interface { CreateGpu(ctx context.Context, in *CreateGpuRequest, opts ...grpc.CallOption) (*Gpu, error) // UpdateGpu updates a single GPU resource. UpdateGpu(ctx context.Context, in *UpdateGpuRequest, opts ...grpc.CallOption) (*Gpu, error) + // UpdateGpuStatus updates only the status subresource of a GPU. + UpdateGpuStatus(ctx context.Context, in *UpdateGpuStatusRequest, opts ...grpc.CallOption) (*Gpu, error) // DeleteGpu deletes a single GPU resource. 
DeleteGpu(ctx context.Context, in *DeleteGpuRequest, opts ...grpc.CallOption) (*emptypb.Empty, error) } @@ -129,6 +132,16 @@ func (c *gpuServiceClient) UpdateGpu(ctx context.Context, in *UpdateGpuRequest, return out, nil } +func (c *gpuServiceClient) UpdateGpuStatus(ctx context.Context, in *UpdateGpuStatusRequest, opts ...grpc.CallOption) (*Gpu, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(Gpu) + err := c.cc.Invoke(ctx, GpuService_UpdateGpuStatus_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + func (c *gpuServiceClient) DeleteGpu(ctx context.Context, in *DeleteGpuRequest, opts ...grpc.CallOption) (*emptypb.Empty, error) { cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) out := new(emptypb.Empty) @@ -155,6 +168,8 @@ type GpuServiceServer interface { CreateGpu(context.Context, *CreateGpuRequest) (*Gpu, error) // UpdateGpu updates a single GPU resource. UpdateGpu(context.Context, *UpdateGpuRequest) (*Gpu, error) + // UpdateGpuStatus updates only the status subresource of a GPU. + UpdateGpuStatus(context.Context, *UpdateGpuStatusRequest) (*Gpu, error) // DeleteGpu deletes a single GPU resource. 
DeleteGpu(context.Context, *DeleteGpuRequest) (*emptypb.Empty, error) mustEmbedUnimplementedGpuServiceServer() @@ -182,6 +197,9 @@ func (UnimplementedGpuServiceServer) CreateGpu(context.Context, *CreateGpuReques func (UnimplementedGpuServiceServer) UpdateGpu(context.Context, *UpdateGpuRequest) (*Gpu, error) { return nil, status.Errorf(codes.Unimplemented, "method UpdateGpu not implemented") } +func (UnimplementedGpuServiceServer) UpdateGpuStatus(context.Context, *UpdateGpuStatusRequest) (*Gpu, error) { + return nil, status.Errorf(codes.Unimplemented, "method UpdateGpuStatus not implemented") +} func (UnimplementedGpuServiceServer) DeleteGpu(context.Context, *DeleteGpuRequest) (*emptypb.Empty, error) { return nil, status.Errorf(codes.Unimplemented, "method DeleteGpu not implemented") } @@ -289,6 +307,24 @@ func _GpuService_UpdateGpu_Handler(srv interface{}, ctx context.Context, dec fun return interceptor(ctx, in, info, handler) } +func _GpuService_UpdateGpuStatus_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(UpdateGpuStatusRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(GpuServiceServer).UpdateGpuStatus(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: GpuService_UpdateGpuStatus_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(GpuServiceServer).UpdateGpuStatus(ctx, req.(*UpdateGpuStatusRequest)) + } + return interceptor(ctx, in, info, handler) +} + func _GpuService_DeleteGpu_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { in := new(DeleteGpuRequest) if err := dec(in); err != nil { @@ -330,6 +366,10 @@ var GpuService_ServiceDesc = grpc.ServiceDesc{ MethodName: "UpdateGpu", Handler: _GpuService_UpdateGpu_Handler, }, + { + 
MethodName: "UpdateGpuStatus", + Handler: _GpuService_UpdateGpuStatus_Handler, + }, { MethodName: "DeleteGpu", Handler: _GpuService_DeleteGpu_Handler, diff --git a/pkg/client-go/client/versioned/clientset.go b/pkg/client-go/client/versioned/clientset.go index 0779de3d7..6a7505817 100644 --- a/pkg/client-go/client/versioned/clientset.go +++ b/pkg/client-go/client/versioned/clientset.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/client-go/client/versioned/fake/clientset_generated.go b/pkg/client-go/client/versioned/fake/clientset_generated.go index e0118c0f8..71cea6c1b 100644 --- a/pkg/client-go/client/versioned/fake/clientset_generated.go +++ b/pkg/client-go/client/versioned/fake/clientset_generated.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/client-go/client/versioned/fake/doc.go b/pkg/client-go/client/versioned/fake/doc.go index 44b048c89..f6c7d06f0 100644 --- a/pkg/client-go/client/versioned/fake/doc.go +++ b/pkg/client-go/client/versioned/fake/doc.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/client-go/client/versioned/fake/register.go b/pkg/client-go/client/versioned/fake/register.go index 1573cb4f7..a2d9f7802 100644 --- a/pkg/client-go/client/versioned/fake/register.go +++ b/pkg/client-go/client/versioned/fake/register.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/client-go/client/versioned/scheme/doc.go b/pkg/client-go/client/versioned/scheme/doc.go index 55f52dc51..a3f9c58bc 100644 --- a/pkg/client-go/client/versioned/scheme/doc.go +++ b/pkg/client-go/client/versioned/scheme/doc.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/client-go/client/versioned/scheme/register.go b/pkg/client-go/client/versioned/scheme/register.go index 97cf5a8ff..46045b406 100644 --- a/pkg/client-go/client/versioned/scheme/register.go +++ b/pkg/client-go/client/versioned/scheme/register.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/client-go/client/versioned/typed/device/v1alpha1/device_client.go b/pkg/client-go/client/versioned/typed/device/v1alpha1/device_client.go index 01b63b877..d2ca86aa3 100644 --- a/pkg/client-go/client/versioned/typed/device/v1alpha1/device_client.go +++ b/pkg/client-go/client/versioned/typed/device/v1alpha1/device_client.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/client-go/client/versioned/typed/device/v1alpha1/doc.go b/pkg/client-go/client/versioned/typed/device/v1alpha1/doc.go index 7749c1800..c689ab840 100644 --- a/pkg/client-go/client/versioned/typed/device/v1alpha1/doc.go +++ b/pkg/client-go/client/versioned/typed/device/v1alpha1/doc.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/doc.go b/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/doc.go index 2702a5453..942a10f72 100644 --- a/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/doc.go +++ b/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/doc.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_device_client.go b/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_device_client.go index 32c7c5401..5bd437c2c 100644 --- a/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_device_client.go +++ b/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_device_client.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_gpu.go b/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_gpu.go index 192da6fa1..e68564670 100644 --- a/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_gpu.go +++ b/pkg/client-go/client/versioned/typed/device/v1alpha1/fake/fake_gpu.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -110,6 +110,15 @@ func (c *fakeGPUs) Update(ctx context.Context, gPU *devicev1alpha1.GPU, opts v1. return obj.(*devicev1alpha1.GPU), err } +func (c *fakeGPUs) UpdateStatus(ctx context.Context, gPU *devicev1alpha1.GPU, opts v1.UpdateOptions) (*devicev1alpha1.GPU, error) { + obj, err := c.Fake. + Invokes(testing.NewRootUpdateSubresourceActionWithOptions(c.Resource(), "status", gPU, opts), &devicev1alpha1.GPU{}) + if obj == nil { + return nil, err + } + return obj.(*devicev1alpha1.GPU), err +} + // Delete takes name of the gPU and deletes it. Returns an error if one occurs. func (c *fakeGPUs) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error { _, err := c.Fake. 
diff --git a/pkg/client-go/client/versioned/typed/device/v1alpha1/generated_expansion.go b/pkg/client-go/client/versioned/typed/device/v1alpha1/generated_expansion.go index c99bbb48c..97d724146 100644 --- a/pkg/client-go/client/versioned/typed/device/v1alpha1/generated_expansion.go +++ b/pkg/client-go/client/versioned/typed/device/v1alpha1/generated_expansion.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/client-go/client/versioned/typed/device/v1alpha1/gpu.go b/pkg/client-go/client/versioned/typed/device/v1alpha1/gpu.go index 4328d58a5..734754200 100644 --- a/pkg/client-go/client/versioned/typed/device/v1alpha1/gpu.go +++ b/pkg/client-go/client/versioned/typed/device/v1alpha1/gpu.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -38,6 +38,7 @@ type GPUsGetter interface { type GPUInterface interface { Create(ctx context.Context, gPU *devicev1alpha1.GPU, opts v1.CreateOptions) (*devicev1alpha1.GPU, error) Update(ctx context.Context, gPU *devicev1alpha1.GPU, opts v1.UpdateOptions) (*devicev1alpha1.GPU, error) + UpdateStatus(ctx context.Context, gPU *devicev1alpha1.GPU, opts v1.UpdateOptions) (*devicev1alpha1.GPU, error) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error Get(ctx context.Context, name string, opts v1.GetOptions) (*devicev1alpha1.GPU, error) List(ctx context.Context, opts v1.ListOptions) (*devicev1alpha1.GPUList, error) @@ -191,6 +192,26 @@ func (c *gpus) Update(ctx context.Context, gpu *devicev1alpha1.GPU, opts v1.Upda return obj, nil } +// UpdateStatus updates only the status subresource of a GPU. +func (c *gpus) UpdateStatus(ctx context.Context, gpu *devicev1alpha1.GPU, opts v1.UpdateOptions) (*devicev1alpha1.GPU, error) { + resp, err := c.client.UpdateGpuStatus(ctx, &pb.UpdateGpuStatusRequest{ + Gpu: devicev1alpha1.ToProto(gpu), + Opts: &pb.UpdateOptions{}, + }) + if err != nil { + return nil, err + } + + obj := devicev1alpha1.FromProto(resp) + c.logger.V(2).Info("Updated GPU status", + "name", obj.GetName(), + "namespace", c.getNamespace(), + "resource-version", obj.GetResourceVersion(), + ) + + return obj, nil +} + // TODO: Implement DeleteOptions support. func (c *gpus) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error { _, err := c.client.DeleteGpu(ctx, &pb.DeleteGpuRequest{ diff --git a/pkg/client-go/informers/externalversions/device/interface.go b/pkg/client-go/informers/externalversions/device/interface.go index 871a7d07f..702c09212 100644 --- a/pkg/client-go/informers/externalversions/device/interface.go +++ b/pkg/client-go/informers/externalversions/device/interface.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/client-go/informers/externalversions/device/v1alpha1/gpu.go b/pkg/client-go/informers/externalversions/device/v1alpha1/gpu.go index db5da81ac..b5f6f419f 100644 --- a/pkg/client-go/informers/externalversions/device/v1alpha1/gpu.go +++ b/pkg/client-go/informers/externalversions/device/v1alpha1/gpu.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/client-go/informers/externalversions/device/v1alpha1/interface.go b/pkg/client-go/informers/externalversions/device/v1alpha1/interface.go index f3921c8e3..68303b6eb 100644 --- a/pkg/client-go/informers/externalversions/device/v1alpha1/interface.go +++ b/pkg/client-go/informers/externalversions/device/v1alpha1/interface.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/client-go/informers/externalversions/factory.go b/pkg/client-go/informers/externalversions/factory.go index 296c50425..cbf2ef267 100644 --- a/pkg/client-go/informers/externalversions/factory.go +++ b/pkg/client-go/informers/externalversions/factory.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/client-go/informers/externalversions/generic.go b/pkg/client-go/informers/externalversions/generic.go index f8ccccacc..0382aab5b 100644 --- a/pkg/client-go/informers/externalversions/generic.go +++ b/pkg/client-go/informers/externalversions/generic.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go b/pkg/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go index 35543b30e..f63107c96 100644 --- a/pkg/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go +++ b/pkg/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/client-go/listers/device/v1alpha1/expansion_generated.go b/pkg/client-go/listers/device/v1alpha1/expansion_generated.go index 1aa65cee4..011529aa5 100644 --- a/pkg/client-go/listers/device/v1alpha1/expansion_generated.go +++ b/pkg/client-go/listers/device/v1alpha1/expansion_generated.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/client-go/listers/device/v1alpha1/gpu.go b/pkg/client-go/listers/device/v1alpha1/gpu.go index 709bd429f..2ea778590 100644 --- a/pkg/client-go/listers/device/v1alpha1/gpu.go +++ b/pkg/client-go/listers/device/v1alpha1/gpu.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/controlplane/apiserver/config.go b/pkg/controlplane/apiserver/config.go index bb3d8bff7..0fa090d3b 100644 --- a/pkg/controlplane/apiserver/config.go +++ b/pkg/controlplane/apiserver/config.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -30,7 +30,7 @@ import ( "github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/metrics" "github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/options" "github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/registry" - "github.com/nvidia/nvsentinel/pkg/util/version" + "github.com/nvidia/nvsentinel/pkg/version" ) type Config struct { diff --git a/pkg/controlplane/apiserver/metrics/metrics.go b/pkg/controlplane/apiserver/metrics/metrics.go index 98056ec81..2618ebebc 100644 --- a/pkg/controlplane/apiserver/metrics/metrics.go +++ b/pkg/controlplane/apiserver/metrics/metrics.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -18,7 +18,7 @@ import ( "sync" grpcprom "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus" - "github.com/nvidia/nvsentinel/pkg/util/version" + "github.com/nvidia/nvsentinel/pkg/version" "github.com/prometheus/client_golang/prometheus" "google.golang.org/grpc" "k8s.io/klog/v2" @@ -31,14 +31,18 @@ type ServerMetrics struct { Registry *prometheus.Registry Collectors *grpcprom.ServerMetrics ServiceHealthStatus *prometheus.GaugeVec + mu sync.Mutex buildInfoLabels prometheus.Labels registerOnce sync.Once } // WithBuildInfo populates the metadata labels used by the build_info metric. +// Must be called before Register() and only from a single goroutine (typically during init). func (m *ServerMetrics) WithBuildInfo(info version.Info) *ServerMetrics { + m.mu.Lock() + defer m.mu.Unlock() m.buildInfoLabels = prometheus.Labels{ - "version": info.GitVersion, + "version": info.Version, "revision": info.GitCommit, "build_date": info.BuildDate, "goversion": info.GoVersion, @@ -79,11 +83,15 @@ func (m *ServerMetrics) Register() { klog.ErrorS(err, "Failed to register service health metrics") } - if m.buildInfoLabels != nil { + m.mu.Lock() + labels := m.buildInfoLabels + m.mu.Unlock() + + if labels != nil { version := prometheus.NewGauge(prometheus.GaugeOpts{ Name: "device_apiserver_build_info", Help: "Build information about the device-apiserver binary.", - ConstLabels: m.buildInfoLabels, + ConstLabels: labels, }) version.Set(1) diff --git a/pkg/controlplane/apiserver/options/grpc/options.go b/pkg/controlplane/apiserver/options/grpc/options.go index 238700c8b..ff46b4728 100644 --- a/pkg/controlplane/apiserver/options/grpc/options.go +++ b/pkg/controlplane/apiserver/options/grpc/options.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -125,8 +125,6 @@ func (o *Options) Complete() (CompletedOptions, error) { o.MinPingInterval = 5 * time.Second } - o.PermitWithoutStream = true - completed := completedOptions{ Options: *o, } @@ -197,12 +195,6 @@ func (o *Options) Validate() []error { o.MinPingInterval)) } - if !o.PermitWithoutStream { - allErrors = append(allErrors, - fmt.Errorf("permit-without-stream: %v must be true to allow keepalive pings without active streams", - o.PermitWithoutStream)) - } - return allErrors } diff --git a/pkg/controlplane/apiserver/options/grpc/options_test.go b/pkg/controlplane/apiserver/options/grpc/options_test.go index eb725f423..f39e52f4c 100644 --- a/pkg/controlplane/apiserver/options/grpc/options_test.go +++ b/pkg/controlplane/apiserver/options/grpc/options_test.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -75,9 +75,8 @@ func TestComplete(t *testing.T) { if completed.MaxRecvMsgSize != 4194304 { t.Errorf("expected default recv size 4MiB, got %d", completed.MaxRecvMsgSize) } - if !completed.PermitWithoutStream { - t.Error("PermitWithoutStream should be forced to true") - } + // PermitWithoutStream defaults to true via NewOptions(), not forced by Complete(). + // A zero-value Options{} will have PermitWithoutStream=false since there is no flag for it. 
}) t.Run("Preserve user overrides", func(t *testing.T) { diff --git a/pkg/controlplane/apiserver/options/options.go b/pkg/controlplane/apiserver/options/options.go index 113523ef5..c6b5b9470 100644 --- a/pkg/controlplane/apiserver/options/options.go +++ b/pkg/controlplane/apiserver/options/options.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ package options import ( "context" "fmt" + "net" "os" "strings" "time" @@ -116,6 +117,8 @@ func (o *Options) Complete(ctx context.Context) (CompletedOptions, error) { o.NodeName = strings.ToLower(strings.TrimSpace(o.NodeName)) //nolint:wsl if o.HealthAddress == "" { + // Default binds to all interfaces for Kubernetes kubelet health probes. + // Use NetworkPolicy to restrict access in production. o.HealthAddress = ":50051" } @@ -124,6 +127,8 @@ func (o *Options) Complete(ctx context.Context) (CompletedOptions, error) { } if o.MetricsAddress == "" { + // Default binds to all interfaces for Prometheus scraping. + // Use NetworkPolicy to restrict access in production. 
o.MetricsAddress = ":9090" } @@ -203,10 +208,15 @@ func (o *CompletedOptions) Validate() []error { } } - if o.HealthAddress != "" && o.HealthAddress == o.MetricsAddress { - allErrors = append(allErrors, - fmt.Errorf("health-probe-bind-address and metrics-bind-address: must not be the same (%s)", - o.HealthAddress)) + if o.HealthAddress != "" && o.MetricsAddress != "" { + _, healthPort, _ := net.SplitHostPort(o.HealthAddress) + _, metricsPort, _ := net.SplitHostPort(o.MetricsAddress) + + if healthPort != "" && healthPort == metricsPort { + allErrors = append(allErrors, + fmt.Errorf("health-probe-bind-address and metrics-bind-address: must not use the same port (%s)", + healthPort)) + } } if o.ShutdownGracePeriod < 0 { diff --git a/pkg/controlplane/apiserver/options/options_test.go b/pkg/controlplane/apiserver/options/options_test.go index 0f905a6bc..1108e9f7e 100644 --- a/pkg/controlplane/apiserver/options/options_test.go +++ b/pkg/controlplane/apiserver/options/options_test.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -158,7 +158,7 @@ func TestValidate(t *testing.T) { o.MetricsAddress = ":8080" }, wantErr: true, - errContains: "must not be the same (:8080)", + errContains: "must not use the same port (8080)", }, { name: "Negative service monitor period", diff --git a/pkg/controlplane/apiserver/server.go b/pkg/controlplane/apiserver/server.go index a7a616aa9..41ecdb00a 100644 --- a/pkg/controlplane/apiserver/server.go +++ b/pkg/controlplane/apiserver/server.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -28,7 +28,7 @@ import ( "github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/metrics" "github.com/nvidia/nvsentinel/pkg/storage/storagebackend" netutils "github.com/nvidia/nvsentinel/pkg/util/net" - "github.com/nvidia/nvsentinel/pkg/util/version" + "github.com/nvidia/nvsentinel/pkg/version" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "google.golang.org/grpc" @@ -36,6 +36,7 @@ import ( "google.golang.org/grpc/health" healthpb "google.golang.org/grpc/health/grpc_health_v1" "google.golang.org/grpc/reflection" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" ) @@ -51,7 +52,6 @@ type DeviceAPIServer struct { AdminServer *grpc.Server AdminCleanup func() Metrics *metrics.ServerMetrics - MetricsRegistry *prometheus.Registry Storage *storagebackend.Storage ServiceProviders []api.ServiceProvider mu sync.RWMutex @@ -92,9 +92,16 @@ func (s *DeviceAPIServer) PrepareRun(ctx context.Context) (preparedDeviceAPIServ if s.HealthAddress != "" { s.HealthServer = health.NewServer() healthpb.RegisterHealthServer(s.AdminServer, s.HealthServer) + // Also register on DeviceServer so sidecar providers connecting via + // unix socket can perform health checks without a separate connection. + healthpb.RegisterHealthServer(s.DeviceServer, s.HealthServer) s.HealthServer.SetServingStatus("", healthpb.HealthCheckResponse_NOT_SERVING) } + // Enable gRPC reflection on both servers. This is intentional: + // - DeviceServer: allows grpcurl/grpc_cli debugging + // - AdminServer: required for channelz and admin tooling + // To restrict in production, use NetworkPolicy on the admin port. 
reflection.Register(s.DeviceServer) reflection.Register(s.AdminServer) @@ -139,13 +146,27 @@ func (s *DeviceAPIServer) run(ctx context.Context) error { go func() { defer s.wg.Done() + defer func() { + if r := recover(); r != nil { + klog.ErrorS(nil, "Health monitor panicked, setting NOT_SERVING", "panic", r) + + if s.HealthServer != nil { + s.HealthServer.SetServingStatus("", healthpb.HealthCheckResponse_NOT_SERVING) + } + } + }() + s.monitorServiceHealth(ctx) }() } if s.MetricsAddress != "" { - // TODO: put in wg?? - go s.serveMetrics(ctx) + s.wg.Add(1) + go func() { + defer s.wg.Done() + + s.serveMetrics(ctx) + }() } if err := s.waitForStorage(ctx); err != nil { @@ -174,7 +195,18 @@ func (s *DeviceAPIServer) run(ctx context.Context) error { s.DeviceServer.GracefulStop() if s.AdminServer != nil { - s.AdminServer.GracefulStop() + adminDone := make(chan struct{}) + go func() { + s.AdminServer.GracefulStop() + close(adminDone) + }() + + select { + case <-adminDone: + case <-time.After(s.ShutdownGracePeriod): + logger.V(2).Info("AdminServer graceful stop timed out, forcing stop") + s.AdminServer.Stop() + } } if s.AdminCleanup != nil { @@ -214,14 +246,17 @@ func (s *DeviceAPIServer) serveHealth(ctx context.Context) { // to unblock Serve and reject new conns. 
go func() { <-ctx.Done() - lis.Close() + + if err := lis.Close(); err != nil { + logger.Error(err, "Failed to close health listener", "address", s.HealthAddress) + } }() logger.V(2).Info("Starting health server", "address", s.HealthAddress) serveErr := s.AdminServer.Serve(lis) if serveErr != nil && !errors.Is(serveErr, grpc.ErrServerStopped) && !errors.Is(serveErr, net.ErrClosed) { - logger.Error(err, "Health server stopped unexpectedly") + logger.Error(serveErr, "Health server stopped unexpectedly") } } @@ -268,7 +303,7 @@ func (s *DeviceAPIServer) serveMetrics(ctx context.Context) { serveErr := metricsSrv.Serve(lis) if serveErr != nil && !errors.Is(serveErr, http.ErrServerClosed) && !errors.Is(serveErr, net.ErrClosed) { - logger.Error(err, "Metrics server stopped unexpectedly", "address", s.MetricsAddress) + logger.Error(serveErr, "Metrics server stopped unexpectedly", "address", s.MetricsAddress) } } @@ -277,48 +312,40 @@ func (s *DeviceAPIServer) waitForStorage(ctx context.Context) error { return fmt.Errorf("storage backend is not initialized") } - logger := klog.FromContext(ctx) - startTime := time.Now() - if s.Storage.IsReady() { return nil } - pollTicker := time.NewTicker(200 * time.Millisecond) - defer pollTicker.Stop() - - heartbeat := time.NewTicker(5 * time.Second) - defer heartbeat.Stop() - + logger := klog.FromContext(ctx) logger.Info("Waiting for storage backend to become ready") + startTime := time.Now() - for { - select { - case <-ctx.Done(): - return ctx.Err() - - case <-pollTicker.C: + err := wait.PollUntilContextTimeout(ctx, 200*time.Millisecond, 60*time.Second, true, + func(ctx context.Context) (bool, error) { if s.Storage.IsReady() { logger.V(2).Info("Storage backend is ready", "duration", time.Since(startTime).Round(time.Second)) - return nil + return true, nil } - case <-heartbeat.C: - logger.V(2).Info("Still waiting for storage backend", - "elapsed", time.Since(startTime).Round(time.Second)) - } + return false, nil + }, + ) + if err != nil { 
+ return fmt.Errorf("timed out waiting for storage backend readiness: %w", err) } + + return nil } func (s *DeviceAPIServer) installAPIServices(ctx context.Context) error { logger := klog.FromContext(ctx) var services []api.Service - for _, sp := range s.ServiceProviders { + for i, sp := range s.ServiceProviders { service, err := sp.Install(s.DeviceServer, s.Storage.StorageConfig) if err != nil { - return fmt.Errorf("failed to install API service: %w", err) + return fmt.Errorf("failed to install API service (index %d): %w", i, err) } services = append(services, service) diff --git a/pkg/grpc/client/client_conn.go b/pkg/grpc/client/client_conn.go index 1563e6d6a..5a19b3810 100644 --- a/pkg/grpc/client/client_conn.go +++ b/pkg/grpc/client/client_conn.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ package client import ( "fmt" + "strings" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" @@ -39,6 +40,15 @@ func ClientConnFor(config *Config, opts ...DialOption) (*grpc.ClientConn, error) return nil, err } + // Insecure credentials are only safe over Unix domain sockets. + // TLS is required for non-UDS targets (dns:, passthrough:). + if !strings.HasPrefix(cfg.Target, "unix://") && !strings.HasPrefix(cfg.Target, "unix:") { + return nil, fmt.Errorf( + "insecure credentials require unix:// target, got %q; TLS is required for non-UDS targets", + cfg.Target, + ) + } + logger := cfg.GetLogger() grpcOpts := []grpc.DialOption{ diff --git a/pkg/grpc/client/client_conn_test.go b/pkg/grpc/client/client_conn_test.go index 18f9d1864..b8589fad4 100644 --- a/pkg/grpc/client/client_conn_test.go +++ b/pkg/grpc/client/client_conn_test.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ package client import ( + "strings" "testing" "github.com/go-logr/logr" @@ -54,4 +55,18 @@ func TestClientConnFor(t *testing.T) { } conn.Close() }) + + t.Run("Rejects non-unix target with insecure credentials", func(t *testing.T) { + cfg := &Config{ + Target: "dns:///localhost:8080", + UserAgent: "test/1.0", + } + _, err := ClientConnFor(cfg) + if err == nil { + t.Fatal("expected error for non-unix target with insecure credentials") + } + if !strings.Contains(err.Error(), "insecure credentials require unix://") { + t.Errorf("unexpected error message: %v", err) + } + }) } diff --git a/pkg/grpc/client/config.go b/pkg/grpc/client/config.go index 308e72bef..1697845df 100644 --- a/pkg/grpc/client/config.go +++ b/pkg/grpc/client/config.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -17,10 +17,11 @@ package client import ( "fmt" "os" + "strings" "time" "github.com/go-logr/logr" - "github.com/nvidia/nvsentinel/pkg/util/version" + "github.com/nvidia/nvsentinel/pkg/version" ) const ( @@ -73,6 +74,12 @@ func (c *Config) Validate() error { return fmt.Errorf("gRPC target address is required; verify %s is not empty", NvidiaDeviceAPITargetEnvVar) } + // Validate target scheme + if !strings.HasPrefix(c.Target, "unix://") && !strings.HasPrefix(c.Target, "unix:") && + !strings.HasPrefix(c.Target, "dns:") && !strings.HasPrefix(c.Target, "passthrough:") { + return fmt.Errorf("gRPC target %q must use unix://, dns:, or passthrough: scheme", c.Target) + } + if c.UserAgent == "" { return fmt.Errorf("user-agent cannot be empty") } diff --git a/pkg/grpc/client/config_test.go b/pkg/grpc/client/config_test.go index 8cb550ed3..048b54e13 100644 --- a/pkg/grpc/client/config_test.go +++ b/pkg/grpc/client/config_test.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -86,13 +86,53 @@ func TestConfig_Validate(t *testing.T) { wantErr bool }{ { - name: "Valid config", + name: "Valid unix:/// config", cfg: Config{ Target: "unix:///var/run/test.sock", UserAgent: "test/1.0", }, wantErr: false, }, + { + name: "Valid unix: config", + cfg: Config{ + Target: "unix:/var/run/test.sock", + UserAgent: "test/1.0", + }, + wantErr: false, + }, + { + name: "Valid dns: config", + cfg: Config{ + Target: "dns:///localhost:8080", + UserAgent: "test/1.0", + }, + wantErr: false, + }, + { + name: "Valid passthrough: config", + cfg: Config{ + Target: "passthrough:///localhost:8080", + UserAgent: "test/1.0", + }, + wantErr: false, + }, + { + name: "Rejects http scheme", + cfg: Config{ + Target: "http://evil.com", + UserAgent: "test/1.0", + }, + wantErr: true, + }, + { + name: "Rejects bare hostname", + cfg: Config{ + Target: "somehost:1234", + UserAgent: "test/1.0", + }, + wantErr: true, + }, { name: "Missing target", cfg: Config{ diff --git a/pkg/grpc/client/interceptors.go b/pkg/grpc/client/interceptors.go index 796a34e50..c8e9e391c 100644 --- a/pkg/grpc/client/interceptors.go +++ b/pkg/grpc/client/interceptors.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -46,7 +46,8 @@ func NewLatencyUnaryInterceptor(logger logr.Logger) grpc.UnaryClientInterceptor return err } - logger.Error(err, "RPC failed", kv...) + logger.V(4).Info("RPC error details", "error", err) + logger.Error(nil, "RPC failed", kv...) return err } @@ -81,7 +82,8 @@ func NewLatencyStreamInterceptor(logger logr.Logger) grpc.StreamClientIntercepto return stream, err } - logger.Error(err, "Stream establishment failed", kv...) + logger.V(4).Info("Stream error details", "error", err) + logger.Error(nil, "Stream establishment failed", kv...) 
return stream, err } diff --git a/pkg/grpc/client/watcher.go b/pkg/grpc/client/watcher.go index 5972ef536..f688d550e 100644 --- a/pkg/grpc/client/watcher.go +++ b/pkg/grpc/client/watcher.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -17,8 +17,10 @@ package client import ( "context" "errors" + "fmt" "io" "sync" + "time" "github.com/go-logr/logr" "google.golang.org/grpc/codes" @@ -64,17 +66,16 @@ func NewWatcher( return w } -// Stop cancels the context and closes the event source. +// Stop signals the receive loop to exit, cancels the context, and closes the event source. func (w *Watcher) Stop() { w.stopOnce.Do(func() { w.logger.V(4).Info("Stopping watcher") - w.cancel() + close(w.done) // Signal receive loop to exit first + w.cancel() // Cancel the context if err := w.source.Close(); err != nil { w.logger.V(4).Info("Error closing source during stop", "err", err) } - - close(w.done) }) } @@ -125,7 +126,7 @@ func (w *Watcher) receive() { return default: - w.logger.V(2).Info("Skipping unknown event type from server", "rawType", typeStr) + w.logger.V(1).Info("Skipping unknown event type from server", "rawType", typeStr) continue } @@ -141,17 +142,26 @@ func (w *Watcher) receive() { "resourceVersion", meta.GetResourceVersion(), ) } + case <-time.After(30 * time.Second): + w.logger.Error(nil, "Event send timed out; consumer not reading, stopping watcher") + return } } } func (w *Watcher) sendError(err error) { st := status.Convert(err) - code := st.Code() + + // Log full error details at debug level only + w.logger.V(4).Info("Watch stream error", + "code", code, + "serverMessage", st.Message(), + ) + statusErr := &metav1.Status{ Status: metav1.StatusFailure, - Message: st.Message(), + Message: fmt.Sprintf("watch stream error: %s", 
code.String()), Code: int32(code), // #nosec G115 } @@ -181,5 +191,7 @@ func (w *Watcher) sendError(err error) { case <-w.done: w.logger.V(4).Info("Watcher already done, dropping error event") case w.result <- watch.Event{Type: watch.Error, Object: statusErr}: + case <-time.After(5 * time.Second): + w.logger.V(2).Info("Error event send timed out, dropping") } } diff --git a/pkg/providers/nvml/enumerator.go b/pkg/providers/nvml/enumerator.go new file mode 100644 index 000000000..f1ac61b38 --- /dev/null +++ b/pkg/providers/nvml/enumerator.go @@ -0,0 +1,199 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build nvml + +package nvml + +import ( + "fmt" + "time" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1" +) + +// enumerateDevices discovers all GPUs via NVML and registers them via gRPC. +// +// For each GPU found, it extracts device information and creates a GPU entry +// via the GpuService API with an initial "NVMLReady" condition set to True. +// +// Returns the number of GPUs discovered. 
+func (p *Provider) enumerateDevices() (int, error) { + count, ret := p.nvmllib.DeviceGetCount() + if ret != nvml.SUCCESS { + return 0, fmt.Errorf("failed to get device count: %v", nvml.ErrorString(ret)) + } + + if count == 0 { + p.logger.Info("No GPUs found on this node") + return 0, nil + } + + p.logger.V(1).Info("Enumerating GPUs", "count", count) + + successCount := 0 + uuids := make([]string, 0, count) + + for i := 0; i < count; i++ { + device, ret := p.nvmllib.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + p.logger.Error(nil, "Failed to get device handle", "index", i, "error", nvml.ErrorString(ret)) + + continue + } + + gpu, productName, memoryBytes, err := p.deviceToGpu(i, device) + if err != nil { + p.logger.Error(err, "Failed to get GPU info", "index", i) + + continue + } + + // Register GPU via typed client (Create is idempotent -- returns existing GPU if already registered) + _, err = p.client.Create(p.ctx, gpu, metav1.CreateOptions{}) + if err != nil { + p.logger.Error(err, "Failed to create GPU via gRPC", "uuid", gpu.Name) + + continue + } + + // Track UUID for health monitoring + uuids = append(uuids, gpu.Name) + + p.logger.Info("GPU registered", + "uuid", gpu.Name, + "productName", productName, + "memory", FormatBytes(memoryBytes), + ) + + successCount++ + } + + // Assign tracked UUIDs atomically (caller holds p.mu) + p.gpuUUIDs = uuids + + return successCount, nil +} + +// deviceToGpu extracts GPU information from an NVML device handle. +// Returns the GPU object, product name, and memory bytes (for logging). 
+func (p *Provider) deviceToGpu(index int, device Device) (*devicev1alpha1.GPU, string, uint64, error) { + // Get UUID (required) + uuid, ret := device.GetUUID() + if ret != nvml.SUCCESS { + return nil, "", 0, fmt.Errorf("failed to get UUID: %v", nvml.ErrorString(ret)) + } + + // Get memory info (for logging) + var memoryBytes uint64 + + memInfo, ret := device.GetMemoryInfo() + if ret == nvml.SUCCESS { + memoryBytes = memInfo.Total + } + + // Get product name (for logging) + productName, ret := device.GetName() + if ret != nvml.SUCCESS { + productName = "Unknown" + } + + // Build GPU object using K8s-native types + now := metav1.Now() + gpu := &devicev1alpha1.GPU{ + ObjectMeta: metav1.ObjectMeta{ + Name: uuid, + }, + Spec: devicev1alpha1.GPUSpec{ + UUID: uuid, + }, + Status: devicev1alpha1.GPUStatus{ + Conditions: []metav1.Condition{ + { + Type: ConditionTypeNVMLReady, + Status: metav1.ConditionStatus(ConditionStatusTrue), + Reason: "Initialized", + Message: fmt.Sprintf("GPU enumerated via NVML: %s (%s)", productName, FormatBytes(memoryBytes)), + LastTransitionTime: now, + }, + }, + }, + } + + return gpu, productName, memoryBytes, nil +} + +// UpdateCondition updates a single condition on a GPU via the typed client. +// +// This method: +// 1. Gets the current GPU state +// 2. Updates/adds the condition in the status +// 3. Sends the updated status via UpdateStatus (status subresource) +// +// The condition's LastTransitionTime is set to the current time. 
+func (p *Provider) UpdateCondition( + uuid string, + conditionType string, + conditionStatus string, + reason, message string, +) error { + // Get current GPU state + gpu, err := p.client.Get(p.ctx, uuid, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get GPU %s: %w", uuid, err) + } + + if gpu == nil { + return fmt.Errorf("Get returned nil for %s", uuid) + } + + // Build the new condition + condition := metav1.Condition{ + Type: conditionType, + Status: metav1.ConditionStatus(conditionStatus), + Reason: reason, + Message: message, + LastTransitionTime: metav1.NewTime(time.Now()), + } + + // Find and replace existing condition, or append + found := false + for i, existing := range gpu.Status.Conditions { + if existing.Type == conditionType { + gpu.Status.Conditions[i] = condition + found = true + break + } + } + if !found { + gpu.Status.Conditions = append(gpu.Status.Conditions, condition) + } + + // Cap conditions to prevent unbounded growth + const maxConditions = 100 + if len(gpu.Status.Conditions) > maxConditions { + gpu.Status.Conditions = gpu.Status.Conditions[len(gpu.Status.Conditions)-maxConditions:] + } + + // Update the GPU status via the status subresource + _, err = p.client.UpdateStatus(p.ctx, gpu, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to update GPU status %s: %w", uuid, err) + } + + return nil +} diff --git a/pkg/providers/nvml/health_monitor.go b/pkg/providers/nvml/health_monitor.go new file mode 100644 index 000000000..5169b3d79 --- /dev/null +++ b/pkg/providers/nvml/health_monitor.go @@ -0,0 +1,282 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build nvml + +package nvml + +import ( + "fmt" + "time" + + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +// HealthMonitor monitors GPU health via NVML events. +type HealthMonitor struct { + provider *Provider +} + +// EventTimeout is the timeout for NVML event wait (in milliseconds). +const EventTimeout = 5000 + +// unknownUUID is used when UUID cannot be retrieved. +const unknownUUID = "unknown" + +// startHealthMonitoring initializes and starts XID event monitoring. +func (p *Provider) startHealthMonitoring() error { + // Create event set + eventSet, ret := p.nvmllib.EventSetCreate() + if ret != nvml.SUCCESS { + return fmt.Errorf("failed to create event set: %v", nvml.ErrorString(ret)) + } + + p.eventSet = eventSet + + // Register for health events on all GPUs + eventMask := uint64( + nvml.EventTypeXidCriticalError | + nvml.EventTypeDoubleBitEccError | + nvml.EventTypeSingleBitEccError, + ) + + count, ret := p.nvmllib.DeviceGetCount() + if ret != nvml.SUCCESS { + _ = p.eventSet.Free() + p.eventSet = nil + return fmt.Errorf("failed to get device count for health monitoring: %v", nvml.ErrorString(ret)) + } + + registeredCount := 0 + + for i := 0; i < count; i++ { + device, ret := p.nvmllib.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + continue + } + + uuid, ret := device.GetUUID() + if ret != nvml.SUCCESS { + p.logger.V(1).Info("Failed to get device UUID for health monitoring, skipping", + "index", i, + "error", nvml.ErrorString(ret), + ) + continue + } + + // Get supported events for this device + supportedEvents, ret := 
device.GetSupportedEventTypes() + if ret != nvml.SUCCESS { + p.logger.V(1).Info("Device does not support event queries", + "index", i, + "uuid", uuid, + "error", nvml.ErrorString(ret), + ) + + continue + } + + // Register only supported events + eventsToRegister := eventMask & supportedEvents + if eventsToRegister == 0 { + p.logger.V(1).Info("Device does not support any health events", + "index", i, + "uuid", uuid, + ) + + continue + } + + ret = device.RegisterEvents(eventsToRegister, p.eventSet.Raw()) + if ret == nvml.ERROR_NOT_SUPPORTED { + p.logger.V(1).Info("Device too old for health monitoring", + "index", i, + "uuid", uuid, + ) + + continue + } + + if ret != nvml.SUCCESS { + p.logger.Error(nil, "Failed to register events", + "index", i, + "uuid", uuid, + "error", nvml.ErrorString(ret), + ) + + continue + } + + registeredCount++ + + p.logger.V(2).Info("Registered health events", + "index", i, + "uuid", uuid, + "events", eventsToRegister, + ) + } + + if registeredCount == 0 { + _ = p.eventSet.Free() + p.eventSet = nil + + return fmt.Errorf("no devices support health event monitoring") + } + + p.logger.Info("Starting health monitoring", "devices", registeredCount) + + // Create health monitor + p.healthMonitor = &HealthMonitor{provider: p} + + // Start monitoring goroutine + p.wg.Add(1) + + go p.runHealthMonitor() + + p.monitorRunning = true + + return nil +} + +// runHealthMonitor is the main health monitoring loop. +// +// The loop checks for context cancellation before each iteration to ensure +// prompt shutdown when requested. The processEvents() call blocks for up to +// EventTimeout milliseconds waiting for NVML events. +func (p *Provider) runHealthMonitor() { + defer p.wg.Done() + + p.logger.V(1).Info("Health monitor started") + + for { + // Check for shutdown before processing events. + // This ensures we respond promptly to cancellation rather than + // waiting for the next event timeout cycle. 
+		select {
+		case <-p.ctx.Done():
+			p.logger.V(1).Info("Health monitor stopping")
+			return
+		default:
+		}
+
+		p.processEvents()
+	}
+}
+
+// processEvents performs one wait-and-dispatch cycle on the NVML event set.
+//
+// It blocks for at most EventTimeout waiting for an event, then either
+// returns (timeout / error) or forwards the event to handleEvent. It is
+// called in a loop by the health-monitor goroutine, which checks p.ctx
+// between iterations.
+func (p *Provider) processEvents() {
+	event, ret := p.eventSet.Wait(EventTimeout)
+
+	if ret == nvml.ERROR_TIMEOUT {
+		// Normal timeout, continue
+		return
+	}
+
+	if ret != nvml.SUCCESS {
+		if ret == nvml.ERROR_GPU_IS_LOST {
+			p.logger.Error(nil, "GPU lost detected, marking all GPUs unhealthy")
+			p.markAllUnhealthy("GPULost", "GPU is lost error detected")
+
+			return
+		}
+
+		p.logger.V(2).Info("Error waiting for event",
+			"error", nvml.ErrorString(ret),
+		)
+
+		// Brief backoff to avoid a tight loop on persistent errors.
+		// Use a ctx-aware wait so shutdown is not delayed by the backoff.
+		select {
+		case <-p.ctx.Done():
+		case <-time.After(100 * time.Millisecond):
+		}
+
+		return
+	}
+
+	// Process the event
+	p.handleEvent(event)
+}
+
+// handleEvent processes a single NVML event.
+//
+// Only XID critical-error events change health state; all other event
+// types are logged at verbosity 2 and dropped. XIDs on the ignore list
+// (defaults plus config.AdditionalIgnoredXids) are also dropped.
+func (p *Provider) handleEvent(event nvml.EventData) {
+	eventType := event.EventType
+	xid := event.EventData
+	gpuInstanceID := event.GpuInstanceId
+	computeInstanceID := event.ComputeInstanceId
+
+	// Get UUID for logging; fall back to unknownUUID when the event
+	// carries no device handle or the UUID query fails.
+	uuid := unknownUUID
+
+	if event.Device != nil {
+		if u, ret := event.Device.GetUUID(); ret == nvml.SUCCESS {
+			uuid = u
+		}
+	}
+
+	// Only process XID critical errors for health changes
+	if eventType != nvml.EventTypeXidCriticalError {
+		p.logger.V(2).Info("Non-critical event received",
+			"uuid", uuid,
+			"eventType", eventType,
+			"xid", xid,
+		)
+
+		return
+	}
+
+	// Check if this XID should be ignored
+	if isIgnoredXid(xid, p.additionalIgnoredXids) {
+		p.logger.V(2).Info("Ignoring non-critical XID",
+			"uuid", uuid,
+			"xid", xid,
+			"gpuInstanceId", gpuInstanceID,
+			"computeInstanceId", computeInstanceID,
+		)
+
+		return
+	}
+
+	// Critical XID - mark GPU unhealthy
+	p.logger.Info("Critical XID error detected",
+		"uuid", uuid,
+		"xid", xid,
+		"xidName", xidToString(xid),
+		"gpuInstanceId", gpuInstanceID,
+		"computeInstanceId", computeInstanceID,
+	)
+
+	message := fmt.Sprintf("Critical XID error %d (%s) detected", xid, xidToString(xid))
+	if err := p.UpdateCondition(uuid, ConditionTypeNVMLReady, ConditionStatusFalse, "XidError", message); err != nil {
+		p.logger.Error(err, "Failed to update GPU condition", "uuid", uuid)
+	}
+}
+
+// markAllUnhealthy marks all tracked GPUs as unhealthy.
+//
+// It snapshots p.gpuUUIDs under the read lock, then updates conditions
+// without holding the lock so UpdateCondition can acquire it itself.
+func (p *Provider) markAllUnhealthy(reason, message string) {
+	p.mu.RLock()
+	uuids := make([]string, len(p.gpuUUIDs))
+	copy(uuids, p.gpuUUIDs)
+	p.mu.RUnlock()
+
+	for _, uuid := range uuids {
+		err := p.UpdateCondition(uuid, ConditionTypeNVMLReady, ConditionStatusFalse, reason, message)
+		if err != nil {
+			p.logger.Error(err, "Failed to mark GPU unhealthy", "uuid", uuid)
+		}
+	}
+}
+
+// MarkHealthy marks a specific GPU as healthy.
+//
+// This can be called to restore a GPU's health status after recovery.
+func (p *Provider) MarkHealthy(uuid string) error {
+	return p.UpdateCondition(uuid, ConditionTypeNVMLReady, ConditionStatusTrue, "Healthy", "GPU is healthy")
+}
diff --git a/pkg/providers/nvml/interface.go b/pkg/providers/nvml/interface.go
new file mode 100644
index 000000000..5b534b154
--- /dev/null
+++ b/pkg/providers/nvml/interface.go
@@ -0,0 +1,143 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build nvml
+
+package nvml
+
+import (
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+)
+
+// Library is the interface for NVML library operations.
+// This interface contains only the methods used by the Provider, +// making it easier to mock for testing. +type Library interface { + Init() nvml.Return + Shutdown() nvml.Return + SystemGetDriverVersion() (string, nvml.Return) + DeviceGetCount() (int, nvml.Return) + DeviceGetHandleByIndex(index int) (Device, nvml.Return) + EventSetCreate() (EventSet, nvml.Return) +} + +// Device is the interface for NVML device operations. +type Device interface { + GetUUID() (string, nvml.Return) + GetName() (string, nvml.Return) + GetMemoryInfo() (nvml.Memory, nvml.Return) + GetRetiredPagesPendingStatus() (nvml.EnableState, nvml.Return) + GetSupportedEventTypes() (uint64, nvml.Return) + RegisterEvents(eventTypes uint64, set nvml.EventSet) nvml.Return +} + +// EventSet is the interface for NVML event set operations. +type EventSet interface { + Wait(timeout uint32) (nvml.EventData, nvml.Return) + Free() nvml.Return + // Raw returns the underlying nvml.EventSet for use with RegisterEvents. + Raw() nvml.EventSet +} + +// nvmlLibraryWrapper wraps the real nvml.Interface to implement Library. +type nvmlLibraryWrapper struct { + lib nvml.Interface +} + +// NewLibraryWrapper creates a Library wrapper around an nvml.Interface. 
+func NewLibraryWrapper(lib nvml.Interface) Library { + return &nvmlLibraryWrapper{lib: lib} +} + +func (w *nvmlLibraryWrapper) Init() nvml.Return { + return w.lib.Init() +} + +func (w *nvmlLibraryWrapper) Shutdown() nvml.Return { + return w.lib.Shutdown() +} + +func (w *nvmlLibraryWrapper) SystemGetDriverVersion() (string, nvml.Return) { + return w.lib.SystemGetDriverVersion() +} + +func (w *nvmlLibraryWrapper) DeviceGetCount() (int, nvml.Return) { + return w.lib.DeviceGetCount() +} + +func (w *nvmlLibraryWrapper) DeviceGetHandleByIndex(index int) (Device, nvml.Return) { + device, ret := w.lib.DeviceGetHandleByIndex(index) + if ret != nvml.SUCCESS { + return nil, ret + } + + return &nvmlDeviceWrapper{device: device}, ret +} + +func (w *nvmlLibraryWrapper) EventSetCreate() (EventSet, nvml.Return) { + es, ret := w.lib.EventSetCreate() + if ret != nvml.SUCCESS { + return nil, ret + } + + return &nvmlEventSetWrapper{es: es}, ret +} + +// nvmlDeviceWrapper wraps nvml.Device to implement Device. +type nvmlDeviceWrapper struct { + device nvml.Device +} + +func (w *nvmlDeviceWrapper) GetUUID() (string, nvml.Return) { + return w.device.GetUUID() +} + +func (w *nvmlDeviceWrapper) GetName() (string, nvml.Return) { + return w.device.GetName() +} + +func (w *nvmlDeviceWrapper) GetMemoryInfo() (nvml.Memory, nvml.Return) { + return w.device.GetMemoryInfo() +} + +func (w *nvmlDeviceWrapper) GetRetiredPagesPendingStatus() (nvml.EnableState, nvml.Return) { + return w.device.GetRetiredPagesPendingStatus() +} + +func (w *nvmlDeviceWrapper) GetSupportedEventTypes() (uint64, nvml.Return) { + return w.device.GetSupportedEventTypes() +} + +func (w *nvmlDeviceWrapper) RegisterEvents(eventTypes uint64, set nvml.EventSet) nvml.Return { + return w.device.RegisterEvents(eventTypes, set) +} + +// nvmlEventSetWrapper wraps nvml.EventSet to implement EventSet. 
+type nvmlEventSetWrapper struct {
+	es nvml.EventSet
+}
+
+// Wait delegates to the wrapped nvml.EventSet.
+func (w *nvmlEventSetWrapper) Wait(timeout uint32) (nvml.EventData, nvml.Return) {
+	return w.es.Wait(timeout)
+}
+
+// Free releases the wrapped event set's resources.
+func (w *nvmlEventSetWrapper) Free() nvml.Return {
+	return w.es.Free()
+}
+
+// Raw returns the underlying nvml.EventSet for use with device.RegisterEvents.
+// This is needed because RegisterEvents expects the concrete nvml.EventSet type.
+func (w *nvmlEventSetWrapper) Raw() nvml.EventSet {
+	return w.es
+}
diff --git a/pkg/providers/nvml/mock_test.go b/pkg/providers/nvml/mock_test.go
new file mode 100644
index 000000000..05785ae64
--- /dev/null
+++ b/pkg/providers/nvml/mock_test.go
@@ -0,0 +1,245 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build nvml
+
+package nvml
+
+import (
+	"sync"
+
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+)
+
+// MockLibrary is a mock implementation of Library for testing.
+type MockLibrary struct { + // Init behavior + InitReturn nvml.Return + + // Shutdown behavior + ShutdownReturn nvml.Return + + // SystemGetDriverVersion behavior + DriverVersion string + DriverVersionReturn nvml.Return + + // DeviceGetCount behavior + DeviceCount int + DeviceCountReturn nvml.Return + + // Devices returns mock devices by index + Devices map[int]*MockDevice + + // EventSetCreate behavior + EventSet *MockEventSet + EventSetCreateReturn nvml.Return + + // Track calls for verification + mu sync.Mutex + InitCalled bool + ShutdownCalled bool +} + +// NewMockLibrary creates a new mock Library with defaults. +func NewMockLibrary() *MockLibrary { + return &MockLibrary{ + InitReturn: nvml.SUCCESS, + ShutdownReturn: nvml.SUCCESS, + DriverVersion: "535.104.05", + DriverVersionReturn: nvml.SUCCESS, + DeviceCount: 0, + DeviceCountReturn: nvml.SUCCESS, + Devices: make(map[int]*MockDevice), + EventSetCreateReturn: nvml.SUCCESS, + } +} + +// AddDevice adds a mock device at the specified index. +func (m *MockLibrary) AddDevice(index int, device *MockDevice) { + m.Devices[index] = device + m.DeviceCount = len(m.Devices) +} + +// Init implements Library. +func (m *MockLibrary) Init() nvml.Return { + m.mu.Lock() + defer m.mu.Unlock() + m.InitCalled = true + + return m.InitReturn +} + +// Shutdown implements Library. +func (m *MockLibrary) Shutdown() nvml.Return { + m.mu.Lock() + defer m.mu.Unlock() + m.ShutdownCalled = true + + return m.ShutdownReturn +} + +// SystemGetDriverVersion implements Library. +func (m *MockLibrary) SystemGetDriverVersion() (string, nvml.Return) { + return m.DriverVersion, m.DriverVersionReturn +} + +// DeviceGetCount implements Library. +func (m *MockLibrary) DeviceGetCount() (int, nvml.Return) { + return m.DeviceCount, m.DeviceCountReturn +} + +// DeviceGetHandleByIndex implements Library. 
+func (m *MockLibrary) DeviceGetHandleByIndex(index int) (Device, nvml.Return) { + if device, ok := m.Devices[index]; ok { + return device, nvml.SUCCESS + } + + return nil, nvml.ERROR_NOT_FOUND +} + +// EventSetCreate implements Library. +func (m *MockLibrary) EventSetCreate() (EventSet, nvml.Return) { + if m.EventSet == nil { + m.EventSet = NewMockEventSet() + } + + return m.EventSet, m.EventSetCreateReturn +} + +// MockDevice is a mock implementation of Device. +type MockDevice struct { + UUID string + UUIDReturn nvml.Return + Name string + NameReturn nvml.Return + MemoryInfo nvml.Memory + MemoryInfoReturn nvml.Return + RetiredPagesPending nvml.EnableState + RetiredPagesPendingReturn nvml.Return + SupportedEvents uint64 + SupportedEventsReturn nvml.Return + RegisterEventsReturn nvml.Return +} + +// NewMockDevice creates a new mock device with sensible defaults. +func NewMockDevice(uuid, name string) *MockDevice { + return &MockDevice{ + UUID: uuid, + UUIDReturn: nvml.SUCCESS, + Name: name, + NameReturn: nvml.SUCCESS, + MemoryInfo: nvml.Memory{ + Total: 16 * 1024 * 1024 * 1024, // 16 GB + Free: 15 * 1024 * 1024 * 1024, + Used: 1 * 1024 * 1024 * 1024, + }, + MemoryInfoReturn: nvml.SUCCESS, + SupportedEvents: uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError), + SupportedEventsReturn: nvml.SUCCESS, + RegisterEventsReturn: nvml.SUCCESS, + } +} + +// GetUUID implements Device. +func (d *MockDevice) GetUUID() (string, nvml.Return) { + return d.UUID, d.UUIDReturn +} + +// GetName implements Device. +func (d *MockDevice) GetName() (string, nvml.Return) { + return d.Name, d.NameReturn +} + +// GetMemoryInfo implements Device. +func (d *MockDevice) GetMemoryInfo() (nvml.Memory, nvml.Return) { + return d.MemoryInfo, d.MemoryInfoReturn +} + +// GetRetiredPagesPendingStatus implements Device. 
+func (d *MockDevice) GetRetiredPagesPendingStatus() (nvml.EnableState, nvml.Return) { + return d.RetiredPagesPending, d.RetiredPagesPendingReturn +} + +// GetSupportedEventTypes implements Device. +func (d *MockDevice) GetSupportedEventTypes() (uint64, nvml.Return) { + return d.SupportedEvents, d.SupportedEventsReturn +} + +// RegisterEvents implements Device. +func (d *MockDevice) RegisterEvents(_ uint64, _ nvml.EventSet) nvml.Return { + return d.RegisterEventsReturn +} + +// MockEventSet is a mock implementation of EventSet. +type MockEventSet struct { + mu sync.Mutex + events []nvml.EventData + eventIdx int + WaitReturn nvml.Return + FreeReturn nvml.Return + Freed bool +} + +// NewMockEventSet creates a new mock event set. +func NewMockEventSet() *MockEventSet { + return &MockEventSet{ + events: make([]nvml.EventData, 0), + WaitReturn: nvml.ERROR_TIMEOUT, + FreeReturn: nvml.SUCCESS, + } +} + +// AddEvent adds an event to be returned by Wait. +func (e *MockEventSet) AddEvent(event nvml.EventData) { + e.mu.Lock() + defer e.mu.Unlock() + e.events = append(e.events, event) +} + +// Wait implements EventSet. +func (e *MockEventSet) Wait(_ uint32) (nvml.EventData, nvml.Return) { + e.mu.Lock() + defer e.mu.Unlock() + + if e.eventIdx < len(e.events) { + event := e.events[e.eventIdx] + e.eventIdx++ + + return event, nvml.SUCCESS + } + + return nvml.EventData{}, e.WaitReturn +} + +// Free implements EventSet. +func (e *MockEventSet) Free() nvml.Return { + e.mu.Lock() + defer e.mu.Unlock() + e.Freed = true + + return e.FreeReturn +} + +// Raw implements EventSet - returns nil for mocks since we don't need real event set. +func (e *MockEventSet) Raw() nvml.EventSet { + return nil +} + +// Compile-time interface checks. 
+var ( + _ Library = (*MockLibrary)(nil) + _ Device = (*MockDevice)(nil) + _ EventSet = (*MockEventSet)(nil) +) + diff --git a/pkg/providers/nvml/provider.go b/pkg/providers/nvml/provider.go new file mode 100644 index 000000000..77c26bcd7 --- /dev/null +++ b/pkg/providers/nvml/provider.go @@ -0,0 +1,275 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build nvml + +// Package nvml provides a built-in NVML-based health provider for the Device API Server. +// +// This provider uses NVML (NVIDIA Management Library) to: +// - Enumerate GPUs on the node at startup +// - Monitor GPU health via XID error events +// - Provide baseline device information when no external providers are connected +// +// The provider requires the NVIDIA driver to be installed and NVML libraries to be +// accessible. When running in Kubernetes, this is typically achieved by using the +// "nvidia" RuntimeClass which injects the driver libraries via the NVIDIA Container +// Toolkit, without consuming GPU resources. +package nvml + +import ( + "context" + "fmt" + "sync" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "k8s.io/klog/v2" + + gpuclient "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned/typed/device/v1alpha1" +) + +// Provider is the built-in NVML-based health provider. +// +// It uses NVML to enumerate GPUs and monitor their health status. 
+// The provider is optional and gracefully degrades if NVML is unavailable. +// +// The provider communicates with the Device API Server via the gRPC client +// interface, making it a "dogfooding" client of its own API. This design: +// - Decouples the provider from server internals +// - Enables running the provider as a separate sidecar process +// - Validates the API from a provider's perspective +type Provider struct { + // Configuration + config Config + + // NVML library interface (uses our wrapper for testability) + nvmllib Library + + // Typed client to communicate with Device API Server + client gpuclient.GPUInterface + + // Logger + logger klog.Logger + + // Health monitoring + eventSet EventSet + healthMonitor *HealthMonitor + monitorRunning bool + + // Lifecycle management + mu sync.RWMutex + ctx context.Context + cancel context.CancelFunc + wg sync.WaitGroup + + // State + initialized bool + gpuCount int + + // Tracked GPU UUIDs for health monitoring + gpuUUIDs []string + + // Pre-computed map of additional ignored XIDs for O(1) lookup + additionalIgnoredXids map[uint64]bool +} + +// Config holds configuration for the NVML provider. +type Config struct { + // DriverRoot is the root path where NVIDIA driver libraries are located. + // Common values: + // - "/run/nvidia/driver" (container with CDI/RuntimeClass) + // - "/" (bare metal or host path mount) + DriverRoot string + + // AdditionalIgnoredXids is a list of additional XID error codes to ignore. + // These are added to the default list of ignored XIDs (application errors). + AdditionalIgnoredXids []uint64 + + // HealthCheckEnabled enables XID event monitoring for health checks. + // When disabled, only device enumeration is performed. + HealthCheckEnabled bool +} + +// DefaultConfig returns a Config with sensible defaults. 
+func DefaultConfig() Config { + return Config{ + DriverRoot: "/run/nvidia/driver", + AdditionalIgnoredXids: nil, + HealthCheckEnabled: true, + } +} + +// New creates a new NVML provider. +// +// The provider is not started until Start() is called. If NVML cannot be +// initialized (e.g., no driver installed), Start() will return an error +// but the server can continue without NVML support. +// +// The client parameter is a GPUInterface used to communicate with the +// Device API Server. This enables the provider to be either: +// - Co-located with the server (using a loopback connection) +// - Running as a separate sidecar process (using a network connection) +func New(cfg Config, client gpuclient.GPUInterface, logger klog.Logger) *Provider { + logger = logger.WithName("nvml-provider") + + // Find NVML library path + libraryPath := FindDriverLibrary(cfg.DriverRoot) + logger.V(2).Info("Using NVML library path", "path", libraryPath) + + // Create NVML interface with explicit library path + var rawLib nvml.Interface + if libraryPath != "" { + rawLib = nvml.New(nvml.WithLibraryPath(libraryPath)) + } else { + // Fall back to system default + rawLib = nvml.New() + } + + return &Provider{ + config: cfg, + nvmllib: NewLibraryWrapper(rawLib), + client: client, + logger: logger, + } +} + +// Start initializes NVML and enumerates GPUs. +// +// If health checking is enabled, it also starts the XID event monitoring +// goroutine. Returns an error if NVML cannot be initialized. 
+func (p *Provider) Start(ctx context.Context) error { + p.mu.Lock() + defer p.mu.Unlock() + + if p.initialized { + return fmt.Errorf("provider already started") + } + + p.logger.Info("Starting NVML provider") + + // Initialize NVML + ret := p.nvmllib.Init() + if ret != nvml.SUCCESS { + return fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret)) + } + + // Get driver version for logging + driverVersion, ret := p.nvmllib.SystemGetDriverVersion() + if ret == nvml.SUCCESS { + p.logger.Info("NVML initialized", "driverVersion", driverVersion) + } + + // Build map of additional ignored XIDs for O(1) lookup + if len(p.config.AdditionalIgnoredXids) > 0 { + p.additionalIgnoredXids = make(map[uint64]bool, len(p.config.AdditionalIgnoredXids)) + for _, xid := range p.config.AdditionalIgnoredXids { + p.additionalIgnoredXids[xid] = true + } + } + + // Set up context for lifecycle management (must be before enumerateDevices, + // which uses p.ctx for gRPC calls) + p.ctx, p.cancel = context.WithCancel(ctx) + + // Enumerate devices + count, err := p.enumerateDevices() + if err != nil { + p.cancel() + p.ctx = nil + p.cancel = nil + _ = p.nvmllib.Shutdown() + + return fmt.Errorf("failed to enumerate devices: %w", err) + } + + p.gpuCount = count + + p.logger.Info("Enumerated GPUs", "count", count) + + p.initialized = true + + // Start health monitoring if enabled and we have GPUs + if p.config.HealthCheckEnabled && count > 0 { + if err := p.startHealthMonitoring(); err != nil { + p.logger.Error(err, "Failed to start health monitoring, continuing without it") + // Don't fail - health monitoring is optional + } + } + + return nil +} + +// Stop shuts down the NVML provider. +// +// It stops health monitoring (if running) and shuts down NVML. +// This method is safe to call multiple times. 
+func (p *Provider) Stop() { + p.mu.Lock() + defer p.mu.Unlock() + + if !p.initialized { + return + } + + p.logger.Info("Stopping NVML provider") + + // Cancel context to stop health monitoring + if p.cancel != nil { + p.cancel() + } + + // Wait for health monitor to stop + p.wg.Wait() + + // Clean up event set + if p.eventSet != nil { + if ret := p.eventSet.Free(); ret != nvml.SUCCESS { + p.logger.V(1).Info("Failed to free event set", "error", nvml.ErrorString(ret)) + } + + p.eventSet = nil + } + + // Shutdown NVML + if ret := p.nvmllib.Shutdown(); ret != nvml.SUCCESS { + p.logger.V(1).Info("Failed to shutdown NVML", "error", nvml.ErrorString(ret)) + } + + p.initialized = false + p.monitorRunning = false + p.logger.Info("NVML provider stopped") +} + +// IsInitialized returns true if the provider has been successfully started. +func (p *Provider) IsInitialized() bool { + p.mu.Lock() + defer p.mu.Unlock() + + return p.initialized +} + +// GPUCount returns the number of GPUs discovered. +func (p *Provider) GPUCount() int { + p.mu.Lock() + defer p.mu.Unlock() + + return p.gpuCount +} + +// IsHealthMonitorRunning returns true if health monitoring is active. +func (p *Provider) IsHealthMonitorRunning() bool { + p.mu.Lock() + defer p.mu.Unlock() + + return p.monitorRunning +} diff --git a/pkg/providers/nvml/provider_test.go b/pkg/providers/nvml/provider_test.go new file mode 100644 index 000000000..244fc106b --- /dev/null +++ b/pkg/providers/nvml/provider_test.go @@ -0,0 +1,606 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build nvml + +package nvml + +import ( + "context" + "testing" + "time" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/watch" + "k8s.io/klog/v2" + + devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1" + gpuclient "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned/typed/device/v1alpha1" + "github.com/nvidia/nvsentinel/pkg/testutil" +) + +// testLogger returns a test logger. +func testLogger() klog.Logger { + return klog.NewKlogr().WithName("test") +} + +// TestProvider_Start_Success tests successful provider initialization. 
+func TestProvider_Start_Success(t *testing.T) { + mockLib := NewMockLibrary() + mockLib.AddDevice(0, NewMockDevice("GPU-uuid-0", "NVIDIA A100")) + mockLib.AddDevice(1, NewMockDevice("GPU-uuid-1", "NVIDIA A100")) + + client := testutil.NewTestGPUTypedClient(t) + + provider := &Provider{ + config: DefaultConfig(), + nvmllib: mockLib, + client: client, + logger: testLogger(), + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + err := provider.Start(ctx) + if err != nil { + t.Fatalf("Start() failed: %v", err) + } + defer provider.Stop() + + // Verify NVML was initialized + if !mockLib.InitCalled { + t.Error("Init() was not called") + } + + // Verify GPUs were registered + gpuList, err := client.List(context.Background(), metav1.ListOptions{}) + if err != nil { + t.Fatalf("List failed: %v", err) + } + if len(gpuList.Items) != 2 { + t.Errorf("Expected 2 GPUs, got %d", len(gpuList.Items)) + } + + // Verify provider state + if !provider.IsInitialized() { + t.Error("Provider should be initialized") + } + + if provider.GPUCount() != 2 { + t.Errorf("Expected GPUCount() = 2, got %d", provider.GPUCount()) + } +} + +// TestProvider_Start_NVMLInitFails tests graceful handling of NVML init failure. +func TestProvider_Start_NVMLInitFails(t *testing.T) { + mockLib := NewMockLibrary() + mockLib.InitReturn = nvml.ERROR_LIBRARY_NOT_FOUND + + client := testutil.NewTestGPUTypedClient(t) + + provider := &Provider{ + config: DefaultConfig(), + nvmllib: mockLib, + client: client, + logger: testLogger(), + } + + ctx := context.Background() + err := provider.Start(ctx) + + if err == nil { + t.Fatal("Expected Start() to fail when NVML init fails") + } + + if provider.IsInitialized() { + t.Error("Provider should not be initialized after failure") + } +} + +// TestProvider_Start_NoGPUs tests handling of nodes without GPUs. 
+func TestProvider_Start_NoGPUs(t *testing.T) { + mockLib := NewMockLibrary() + mockLib.DeviceCount = 0 + + client := testutil.NewTestGPUTypedClient(t) + + provider := &Provider{ + config: DefaultConfig(), + nvmllib: mockLib, + client: client, + logger: testLogger(), + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + err := provider.Start(ctx) + if err != nil { + t.Fatalf("Start() failed: %v", err) + } + defer provider.Stop() + + if provider.GPUCount() != 0 { + t.Errorf("Expected 0 GPUs, got %d", provider.GPUCount()) + } + + // Health monitor should not be running with 0 GPUs + if provider.IsHealthMonitorRunning() { + t.Error("Health monitor should not run with 0 GPUs") + } +} + +// TestProvider_Start_AlreadyStarted tests double-start prevention. +func TestProvider_Start_AlreadyStarted(t *testing.T) { + mockLib := NewMockLibrary() + client := testutil.NewTestGPUTypedClient(t) + + provider := &Provider{ + config: DefaultConfig(), + nvmllib: mockLib, + client: client, + logger: testLogger(), + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // First start + err := provider.Start(ctx) + if err != nil { + t.Fatalf("First Start() failed: %v", err) + } + defer provider.Stop() + + // Second start should fail + err = provider.Start(ctx) + if err == nil { + t.Error("Second Start() should fail") + } +} + +// TestProvider_Stop tests provider shutdown. 
+func TestProvider_Stop(t *testing.T) { + mockLib := NewMockLibrary() + mockLib.AddDevice(0, NewMockDevice("GPU-uuid-0", "NVIDIA A100")) + + client := testutil.NewTestGPUTypedClient(t) + + provider := &Provider{ + config: DefaultConfig(), + nvmllib: mockLib, + client: client, + logger: testLogger(), + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + err := provider.Start(ctx) + if err != nil { + t.Fatalf("Start() failed: %v", err) + } + + // Stop the provider + provider.Stop() + + // Verify state + if provider.IsInitialized() { + t.Error("Provider should not be initialized after Stop()") + } + + if !mockLib.ShutdownCalled { + t.Error("NVML Shutdown() was not called") + } + + // Double stop should be safe + provider.Stop() +} + +// TestProvider_Stop_NotStarted tests Stop() on unstarted provider. +func TestProvider_Stop_NotStarted(t *testing.T) { + mockLib := NewMockLibrary() + client := testutil.NewTestGPUTypedClient(t) + + provider := &Provider{ + config: DefaultConfig(), + nvmllib: mockLib, + client: client, + logger: testLogger(), + } + + // Stop should be safe even if not started + provider.Stop() + + if mockLib.ShutdownCalled { + t.Error("Shutdown() should not be called if provider was never started") + } +} + +// TestProvider_DeviceEnumeration tests that devices are properly enumerated. 
+func TestProvider_DeviceEnumeration(t *testing.T) { + mockLib := NewMockLibrary() + + // Add devices with varying configurations + device0 := NewMockDevice("GPU-11111111-1111-1111-1111-111111111111", "NVIDIA H100") + device0.MemoryInfo = nvml.Memory{Total: 80 * 1024 * 1024 * 1024} // 80 GB + + device1 := NewMockDevice("GPU-22222222-2222-2222-2222-222222222222", "NVIDIA A100") + device1.MemoryInfo = nvml.Memory{Total: 40 * 1024 * 1024 * 1024} // 40 GB + + mockLib.AddDevice(0, device0) + mockLib.AddDevice(1, device1) + + client := testutil.NewTestGPUTypedClient(t) + + provider := &Provider{ + config: DefaultConfig(), + nvmllib: mockLib, + client: client, + logger: testLogger(), + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + err := provider.Start(ctx) + if err != nil { + t.Fatalf("Start() failed: %v", err) + } + defer provider.Stop() + + // Verify both devices are registered + gpuList, err := client.List(context.Background(), metav1.ListOptions{}) + if err != nil { + t.Fatalf("List failed: %v", err) + } + gpus := gpuList.Items + if len(gpus) != 2 { + t.Fatalf("Expected 2 GPUs, got %d", len(gpus)) + } + + // Verify GPU details + uuids := make(map[string]bool) + for _, gpu := range gpus { + uuids[gpu.Name] = true + + // Check initial condition + if len(gpu.Status.Conditions) == 0 { + t.Errorf("GPU %s has no conditions", gpu.Name) + continue + } + + cond := gpu.Status.Conditions[0] + if cond.Type != ConditionTypeNVMLReady { + t.Errorf("Expected condition type %s, got %s", ConditionTypeNVMLReady, cond.Type) + } + + if cond.Status != metav1.ConditionStatus(ConditionStatusTrue) { + t.Errorf("Expected condition status True, got %s", cond.Status) + } + } + + if !uuids["GPU-11111111-1111-1111-1111-111111111111"] { + t.Error("GPU-11111111... not found in cache") + } + + if !uuids["GPU-22222222-2222-2222-2222-222222222222"] { + t.Error("GPU-22222222... 
not found in cache") + } +} + +// TestProvider_DeviceEnumeration_PartialFailure tests handling of partial device failures. +func TestProvider_DeviceEnumeration_PartialFailure(t *testing.T) { + mockLib := NewMockLibrary() + + // First device is fine + mockLib.AddDevice(0, NewMockDevice("GPU-good", "NVIDIA A100")) + + // Second device fails UUID retrieval + device1 := NewMockDevice("GPU-bad", "NVIDIA A100") + device1.UUIDReturn = nvml.ERROR_UNKNOWN + mockLib.AddDevice(1, device1) + + // Third device is fine + mockLib.AddDevice(2, NewMockDevice("GPU-good-2", "NVIDIA A100")) + + client := testutil.NewTestGPUTypedClient(t) + + provider := &Provider{ + config: DefaultConfig(), + nvmllib: mockLib, + client: client, + logger: testLogger(), + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + err := provider.Start(ctx) + if err != nil { + t.Fatalf("Start() failed: %v", err) + } + defer provider.Stop() + + // Only 2 GPUs should be registered (one failed) + gpuList, err := client.List(context.Background(), metav1.ListOptions{}) + if err != nil { + t.Fatalf("List failed: %v", err) + } + if len(gpuList.Items) != 2 { + t.Errorf("Expected 2 GPUs (1 failed), got %d", len(gpuList.Items)) + } +} + +// TestProvider_HealthCheckDisabled tests that health monitoring can be disabled. 
+func TestProvider_HealthCheckDisabled(t *testing.T) { + mockLib := NewMockLibrary() + mockLib.AddDevice(0, NewMockDevice("GPU-uuid-0", "NVIDIA A100")) + + client := testutil.NewTestGPUTypedClient(t) + + config := DefaultConfig() + config.HealthCheckEnabled = false + + provider := &Provider{ + config: config, + nvmllib: mockLib, + client: client, + logger: testLogger(), + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + err := provider.Start(ctx) + if err != nil { + t.Fatalf("Start() failed: %v", err) + } + defer provider.Stop() + + // Give a moment for any goroutines to start + time.Sleep(10 * time.Millisecond) + + if provider.IsHealthMonitorRunning() { + t.Error("Health monitor should not be running when disabled") + } +} + +// TestProvider_UpdateCondition tests condition updates. +func TestProvider_UpdateCondition(t *testing.T) { + mockLib := NewMockLibrary() + mockLib.AddDevice(0, NewMockDevice("GPU-uuid-0", "NVIDIA A100")) + + client := testutil.NewTestGPUTypedClient(t) + + config := DefaultConfig() + config.HealthCheckEnabled = false + + provider := &Provider{ + config: config, + nvmllib: mockLib, + client: client, + logger: testLogger(), + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + err := provider.Start(ctx) + if err != nil { + t.Fatalf("Start() failed: %v", err) + } + defer provider.Stop() + + // Update condition to unhealthy + err = provider.UpdateCondition("GPU-uuid-0", ConditionTypeNVMLReady, ConditionStatusFalse, "XidError", "Critical XID 48") + if err != nil { + t.Fatalf("UpdateCondition() failed: %v", err) + } + + // Verify condition was updated + gpu, err := client.Get(context.Background(), "GPU-uuid-0", metav1.GetOptions{}) + if err != nil { + t.Fatalf("Get failed: %v", err) + } + + var foundCondition bool + + for _, cond := range gpu.Status.Conditions { + if cond.Type == ConditionTypeNVMLReady { + foundCondition = true + + if string(cond.Status) != ConditionStatusFalse { + 
t.Errorf("Expected status False, got %s", cond.Status) + } + + if cond.Reason != "XidError" { + t.Errorf("Expected reason XidError, got %s", cond.Reason) + } + } + } + + if !foundCondition { + t.Error("NVMLReady condition not found") + } +} + +// TestProvider_UpdateCondition_GPUNotFound tests condition update for non-existent GPU. +func TestProvider_UpdateCondition_GPUNotFound(t *testing.T) { + mockLib := NewMockLibrary() + client := testutil.NewTestGPUTypedClient(t) + + config := DefaultConfig() + config.HealthCheckEnabled = false + + provider := &Provider{ + config: config, + nvmllib: mockLib, + client: client, + logger: testLogger(), + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + err := provider.Start(ctx) + if err != nil { + t.Fatalf("Start() failed: %v", err) + } + defer provider.Stop() + + // Try to update condition for non-existent GPU + err = provider.UpdateCondition("GPU-nonexistent", ConditionTypeNVMLReady, ConditionStatusFalse, "XidError", "Test") + if err == nil { + t.Error("Expected error for non-existent GPU") + } +} + +// TestProvider_MarkHealthy tests marking a GPU as healthy. 
+func TestProvider_MarkHealthy(t *testing.T) { + mockLib := NewMockLibrary() + mockLib.AddDevice(0, NewMockDevice("GPU-uuid-0", "NVIDIA A100")) + + client := testutil.NewTestGPUTypedClient(t) + + config := DefaultConfig() + config.HealthCheckEnabled = false + + provider := &Provider{ + config: config, + nvmllib: mockLib, + client: client, + logger: testLogger(), + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + err := provider.Start(ctx) + if err != nil { + t.Fatalf("Start() failed: %v", err) + } + defer provider.Stop() + + // First mark as unhealthy + err = provider.UpdateCondition("GPU-uuid-0", ConditionTypeNVMLReady, ConditionStatusFalse, "XidError", "Test") + if err != nil { + t.Fatalf("UpdateCondition() failed: %v", err) + } + + // Then mark as healthy + err = provider.MarkHealthy("GPU-uuid-0") + if err != nil { + t.Fatalf("MarkHealthy() failed: %v", err) + } + + // Verify it's healthy + gpu, err := client.Get(context.Background(), "GPU-uuid-0", metav1.GetOptions{}) + if err != nil { + t.Fatalf("Get failed: %v", err) + } + + for _, cond := range gpu.Status.Conditions { + if cond.Type == ConditionTypeNVMLReady { + if string(cond.Status) != ConditionStatusTrue { + t.Errorf("Expected status True after MarkHealthy, got %s", cond.Status) + } + + return + } + } + + t.Error("NVMLReady condition not found") +} + +// contextCapturingClient wraps a GPUInterface and captures the context +// passed to Create. This allows tests to verify that enumerateDevices +// receives a non-nil context. 
+type contextCapturingClient struct { + inner gpuclient.GPUInterface + capturedCtx context.Context +} + +func newContextCapturingClient(inner gpuclient.GPUInterface) *contextCapturingClient { + return &contextCapturingClient{inner: inner} +} + +func (c *contextCapturingClient) Create(ctx context.Context, gpu *devicev1alpha1.GPU, opts metav1.CreateOptions) (*devicev1alpha1.GPU, error) { + c.capturedCtx = ctx + return c.inner.Create(ctx, gpu, opts) +} + +func (c *contextCapturingClient) Get(ctx context.Context, name string, opts metav1.GetOptions) (*devicev1alpha1.GPU, error) { + return c.inner.Get(ctx, name, opts) +} + +func (c *contextCapturingClient) Update(ctx context.Context, gpu *devicev1alpha1.GPU, opts metav1.UpdateOptions) (*devicev1alpha1.GPU, error) { + return c.inner.Update(ctx, gpu, opts) +} + +func (c *contextCapturingClient) UpdateStatus(ctx context.Context, gpu *devicev1alpha1.GPU, opts metav1.UpdateOptions) (*devicev1alpha1.GPU, error) { + return c.inner.UpdateStatus(ctx, gpu, opts) +} + +func (c *contextCapturingClient) List(ctx context.Context, opts metav1.ListOptions) (*devicev1alpha1.GPUList, error) { + return c.inner.List(ctx, opts) +} + +func (c *contextCapturingClient) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error { + return c.inner.Delete(ctx, name, opts) +} + +func (c *contextCapturingClient) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) { + return c.inner.Watch(ctx, opts) +} + +// TestProvider_Start_ContextSetBeforeEnumerate verifies that enumerateDevices +// receives a non-nil context. Before the fix, p.ctx was nil when +// enumerateDevices was called, which would cause a gRPC panic on any real +// gRPC client. 
+func TestProvider_Start_ContextSetBeforeEnumerate(t *testing.T) { + mockLib := NewMockLibrary() + mockLib.AddDevice(0, NewMockDevice("GPU-ctx-test", "NVIDIA A100")) + + typedClient := testutil.NewTestGPUTypedClient(t) + capturingClient := newContextCapturingClient(typedClient) + + provider := &Provider{ + config: Config{HealthCheckEnabled: false}, + nvmllib: mockLib, + client: capturingClient, + logger: testLogger(), + } + + ctx := context.Background() + err := provider.Start(ctx) + if err != nil { + t.Fatalf("Start() failed: %v", err) + } + defer provider.Stop() + + // The capturing client recorded the context passed to Create during + // enumerateDevices. If the fix is missing, this will be nil because p.ctx + // was not set before enumerateDevices was called. + if capturingClient.capturedCtx == nil { + t.Fatal("Create was called with nil context; p.ctx must be set before enumerateDevices()") + } + + // Also verify p.ctx is set after Start returns. + if provider.ctx == nil { + t.Fatal("p.ctx should be set after Start()") + } +} diff --git a/pkg/providers/nvml/shared.go b/pkg/providers/nvml/shared.go new file mode 100644 index 000000000..d33c58619 --- /dev/null +++ b/pkg/providers/nvml/shared.go @@ -0,0 +1,85 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package nvml + +import ( + "fmt" + "os" + "path/filepath" +) + +// Condition constants for NVML provider. 
+const ( + // ConditionTypeNVMLReady is the condition type for NVML health status. + ConditionTypeNVMLReady = "NVMLReady" + + // ConditionSourceNVML is the source identifier for conditions set by NVML provider. + ConditionSourceNVML = "nvml-provider" + + // ConditionStatusTrue indicates the condition is met. + ConditionStatusTrue = "True" + + // ConditionStatusFalse indicates the condition is not met. + ConditionStatusFalse = "False" + + // ConditionStatusUnknown indicates the condition status is unknown. + ConditionStatusUnknown = "Unknown" +) + +// FormatBytes formats bytes to a human-readable string. +func FormatBytes(bytes uint64) string { + const ( + KB = 1024 + MB = KB * 1024 + GB = MB * 1024 + ) + + switch { + case bytes >= GB: + return fmt.Sprintf("%.1f GB", float64(bytes)/float64(GB)) + case bytes >= MB: + return fmt.Sprintf("%.1f MB", float64(bytes)/float64(MB)) + case bytes >= KB: + return fmt.Sprintf("%.1f KB", float64(bytes)/float64(KB)) + default: + return fmt.Sprintf("%d B", bytes) + } +} + +// FindDriverLibrary locates the NVML library in the driver root. +// +// It searches common paths where libnvidia-ml.so.1 might be located. +// Returns empty string if not found (will use system default). 
+func FindDriverLibrary(driverRoot string) string { + if driverRoot == "" { + return "" + } + + searchPaths := []string{ + filepath.Join(driverRoot, "usr/lib64/libnvidia-ml.so.1"), + filepath.Join(driverRoot, "usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1"), + filepath.Join(driverRoot, "usr/lib/libnvidia-ml.so.1"), + filepath.Join(driverRoot, "lib64/libnvidia-ml.so.1"), + filepath.Join(driverRoot, "lib/libnvidia-ml.so.1"), + } + + for _, path := range searchPaths { + if _, err := os.Stat(path); err == nil { + return path + } + } + + return "" +} diff --git a/pkg/providers/nvml/stub.go b/pkg/providers/nvml/stub.go new file mode 100644 index 000000000..c2b7baf7e --- /dev/null +++ b/pkg/providers/nvml/stub.go @@ -0,0 +1,80 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !nvml + +// Package nvml provides a built-in NVML-based health provider for the Device API Server. +// +// This stub file is used when NVML support is not compiled in (build without -tags=nvml). +package nvml + +import ( + "context" + "errors" + + "k8s.io/klog/v2" + + gpuclient "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned/typed/device/v1alpha1" +) + +// ErrNVMLNotCompiled is returned when NVML support is not compiled into the binary. 
+var ErrNVMLNotCompiled = errors.New("NVML support not compiled in (build with -tags=nvml)") + +// Provider is the built-in NVML-based health provider (stub when not compiled). +type Provider struct{} + +// Config holds configuration for the NVML provider. +type Config struct { + DriverRoot string + AdditionalIgnoredXids []uint64 + HealthCheckEnabled bool +} + +// DefaultConfig returns a Config with sensible defaults. +func DefaultConfig() Config { + return Config{ + DriverRoot: "/run/nvidia/driver", + AdditionalIgnoredXids: nil, + HealthCheckEnabled: true, + } +} + +// New creates a new NVML provider (stub). +func New(cfg Config, client gpuclient.GPUInterface, logger klog.Logger) *Provider { + return &Provider{} +} + +// Start initializes NVML (stub - always returns error). +func (p *Provider) Start(ctx context.Context) error { + return ErrNVMLNotCompiled +} + +// Stop shuts down the NVML provider (stub - no-op). +func (p *Provider) Stop() {} + +// IsInitialized returns false (stub). +func (p *Provider) IsInitialized() bool { + return false +} + +// GPUCount returns 0 (stub). +func (p *Provider) GPUCount() int { + return 0 +} + +// IsHealthMonitorRunning returns false (stub). +func (p *Provider) IsHealthMonitorRunning() bool { + return false +} + diff --git a/pkg/providers/nvml/xid.go b/pkg/providers/nvml/xid.go new file mode 100644 index 000000000..718bb3814 --- /dev/null +++ b/pkg/providers/nvml/xid.go @@ -0,0 +1,213 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package nvml + +import ( + "strconv" + "strings" +) + +// XID errors documentation: +// https://docs.nvidia.com/deploy/xid-errors/index.html + +// defaultIgnoredXids contains XID error codes that are typically caused by +// application errors rather than hardware failures. These are ignored by +// default to avoid false positives in health monitoring. +// +// Reference: https://docs.nvidia.com/deploy/xid-errors/index.html#topic_4 +var defaultIgnoredXids = map[uint64]bool{ + // Application errors - GPU should still be healthy + 13: true, // Graphics Engine Exception + 31: true, // GPU memory page fault + 43: true, // GPU stopped processing + 45: true, // Preemptive cleanup, due to previous errors + 68: true, // Video processor exception + 109: true, // Context Switch Timeout Error +} + +// criticalXids contains XID error codes that indicate critical hardware +// failures requiring immediate attention. +var criticalXids = map[uint64]bool{ + // Memory errors + 48: true, // Double Bit ECC Error + 63: true, // Row remapping failure + 64: true, // Uncontained ECC error + 74: true, // NVLink error + 79: true, // GPU has fallen off the bus + + // Fatal errors + 94: true, // Contained ECC error (severe) + 95: true, // Uncontained ECC error + 119: true, // GSP (GPU System Processor) error + 120: true, // GSP firmware error +} + +// XidDescriptions provides human-readable descriptions for common XIDs. 
+var XidDescriptions = map[uint64]string{ + // Application errors (typically ignored) + 13: "Graphics Engine Exception", + 31: "GPU memory page fault", + 43: "GPU stopped processing", + 45: "Preemptive cleanup", + 68: "Video processor exception", + 109: "Context Switch Timeout", + + // Memory errors + 48: "Double Bit ECC Error", + 63: "Row remapping failure", + 64: "Uncontained ECC error", + 74: "NVLink error", + 79: "GPU has fallen off the bus", + 94: "Contained ECC error", + 95: "Uncontained ECC error", + + // Other notable XIDs + 8: "GPU not accessible", + 32: "Invalid or corrupted push buffer stream", + 38: "Driver firmware error", + 56: "Display engine error", + 57: "Error programming video memory interface", + 62: "Internal micro-controller halt (non-fatal)", + 69: "Graphics engine accessor error", + 119: "GSP error", + 120: "GSP firmware error", +} + +// IsDefaultIgnored returns true if the XID is in the default ignored set. +func IsDefaultIgnored(xid uint64) bool { + return defaultIgnoredXids[xid] +} + +// IsCritical returns true if the XID is in the critical set. +func IsCritical(xid uint64) bool { + return criticalXids[xid] +} + +// DefaultIgnoredXidsList returns a copy of the default ignored XID set. +func DefaultIgnoredXidsList() map[uint64]bool { + out := make(map[uint64]bool, len(defaultIgnoredXids)) + for k, v := range defaultIgnoredXids { + out[k] = v + } + return out +} + +// isIgnoredXid returns true if the XID should be ignored for health purposes. +// +// An XID is ignored if it's in the default ignored list OR in the additional +// ignored map provided by the user. The map is built once at provider startup +// from the config slice for O(1) lookup. +func isIgnoredXid(xid uint64, additionalIgnored map[uint64]bool) bool { + if defaultIgnoredXids[xid] { + return true + } + + return additionalIgnored[xid] +} + +// IsCriticalXid returns true if the XID indicates a critical hardware failure. 
+func IsCriticalXid(xid uint64) bool { + return criticalXids[xid] +} + +// xidToString returns a human-readable description for an XID. +func xidToString(xid uint64) string { + if desc, ok := XidDescriptions[xid]; ok { + return desc + } + + return "Unknown XID" +} + +// ParseIgnoredXids parses a comma-or-space-separated string of XID values. +// Non-numeric tokens are silently skipped. +func ParseIgnoredXids(input string) []uint64 { + if input == "" { + return nil + } + + var result []uint64 + + tokens := strings.FieldsFunc(input, func(r rune) bool { + return r == ',' || r == ' ' + }) + + for _, tok := range tokens { + v, err := strconv.ParseUint(tok, 10, 64) + if err != nil { + continue + } + + result = append(result, v) + } + + if len(result) == 0 { + return nil + } + + return result +} + +// XidSeverity represents the severity level of an XID error. +type XidSeverity int + +const ( + // XidSeverityUnknown indicates the XID severity is unknown. + XidSeverityUnknown XidSeverity = iota + // XidSeverityIgnored indicates the XID is typically caused by applications. + XidSeverityIgnored + // XidSeverityWarning indicates the XID may indicate a problem. + XidSeverityWarning + // XidSeverityCritical indicates the XID indicates a critical hardware failure. + XidSeverityCritical +) + +// Severity string constants. +const ( + severityUnknown = "unknown" + severityIgnored = "ignored" + severityWarning = "warning" + severityCritical = "critical" +) + +// GetXidSeverity returns the severity level for an XID. +func GetXidSeverity(xid uint64) XidSeverity { + if defaultIgnoredXids[xid] { + return XidSeverityIgnored + } + + if criticalXids[xid] { + return XidSeverityCritical + } + + // XIDs not in either list are treated as warnings + return XidSeverityWarning +} + +// String returns a string representation of XidSeverity. 
+func (s XidSeverity) String() string { + switch s { + case XidSeverityUnknown: + return severityUnknown + case XidSeverityIgnored: + return severityIgnored + case XidSeverityWarning: + return severityWarning + case XidSeverityCritical: + return severityCritical + default: + return severityUnknown + } +} diff --git a/pkg/providers/nvml/xid_test.go b/pkg/providers/nvml/xid_test.go new file mode 100644 index 000000000..f6d9eadaf --- /dev/null +++ b/pkg/providers/nvml/xid_test.go @@ -0,0 +1,279 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package nvml + +import ( + "math" + "testing" +) + +func TestIsIgnoredXid_DefaultIgnored(t *testing.T) { + // Test default ignored XIDs + defaultIgnored := []uint64{13, 31, 43, 45, 68, 109} + + for _, xid := range defaultIgnored { + if !isIgnoredXid(xid, nil) { + t.Errorf("XID %d should be ignored by default", xid) + } + } +} + +func TestIsIgnoredXid_CriticalNotIgnored(t *testing.T) { + // Test critical XIDs are not ignored by default + criticalXids := []uint64{48, 63, 64, 74, 79, 94, 95, 119, 120} + + for _, xid := range criticalXids { + if isIgnoredXid(xid, nil) { + t.Errorf("Critical XID %d should not be ignored by default", xid) + } + } +} + +func TestIsIgnoredXid_AdditionalIgnored(t *testing.T) { + // Test additional ignored XIDs + additionalIgnored := map[uint64]bool{48: true, 63: true} // Make critical XIDs ignored + + // Normally critical, but now ignored + if !isIgnoredXid(48, additionalIgnored) { + t.Error("XID 48 should be ignored when in additional list") + } + + if !isIgnoredXid(63, additionalIgnored) { + t.Error("XID 63 should be ignored when in additional list") + } + + // Still critical (not in additional list) + if isIgnoredXid(64, additionalIgnored) { + t.Error("XID 64 should not be ignored (not in additional list)") + } +} + +func TestIsIgnoredXid_UnknownXid(t *testing.T) { + // Unknown XIDs should not be ignored + unknownXids := []uint64{1, 2, 3, 999, 12345} + + for _, xid := range unknownXids { + if isIgnoredXid(xid, nil) { + t.Errorf("Unknown XID %d should not be ignored", xid) + } + } +} + +func TestIsIgnoredXid_BoundaryValues(t *testing.T) { + // Boundary values should not be ignored + if isIgnoredXid(0, nil) { + t.Error("XID 0 should not be ignored") + } + + if isIgnoredXid(math.MaxUint64, nil) { + t.Error("XID MaxUint64 should not be ignored") + } +} + +func TestIsCriticalXid(t *testing.T) { + tests := []struct { + xid uint64 + expected bool + }{ + // Critical XIDs + {48, true}, + {63, true}, + {64, true}, + {74, true}, + {79, true}, + 
{94, true}, + {95, true}, + {119, true}, + {120, true}, + + // Non-critical XIDs + {13, false}, + {31, false}, + {43, false}, + {1, false}, + {999, false}, + + // Boundary values + {0, false}, + {math.MaxUint64, false}, + } + + for _, tt := range tests { + result := IsCriticalXid(tt.xid) + if result != tt.expected { + t.Errorf("IsCriticalXid(%d) = %v, want %v", tt.xid, result, tt.expected) + } + } +} + +func TestXidToString(t *testing.T) { + tests := []struct { + xid uint64 + expected string + }{ + {13, "Graphics Engine Exception"}, + {31, "GPU memory page fault"}, + {48, "Double Bit ECC Error"}, + {79, "GPU has fallen off the bus"}, + {109, "Context Switch Timeout"}, + {999, "Unknown XID"}, + {0, "Unknown XID"}, + } + + for _, tt := range tests { + result := xidToString(tt.xid) + if result != tt.expected { + t.Errorf("xidToString(%d) = %q, want %q", tt.xid, result, tt.expected) + } + } +} + +func TestParseIgnoredXids(t *testing.T) { + tests := []struct { + name string + input string + expected []uint64 + }{ + { + name: "empty string", + input: "", + expected: nil, + }, + { + name: "single value", + input: "48", + expected: []uint64{48}, + }, + { + name: "multiple comma separated", + input: "48,63,64", + expected: []uint64{48, 63, 64}, + }, + { + name: "with spaces", + input: "48, 63, 64", + expected: []uint64{48, 63, 64}, + }, + { + name: "space separated", + input: "48 63 64", + expected: []uint64{48, 63, 64}, + }, + { + name: "mixed separators", + input: "48, 63 64,65", + expected: []uint64{48, 63, 64, 65}, + }, + { + name: "trailing comma", + input: "48,63,", + expected: []uint64{48, 63}, + }, + { + name: "leading comma", + input: ",48,63", + expected: []uint64{48, 63}, + }, + { + name: "non-numeric characters mixed in", + input: "4a8,63", + expected: []uint64{63}, + }, + { + name: "completely non-numeric", + input: "abc", + expected: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := ParseIgnoredXids(tt.input) + + if 
len(result) != len(tt.expected) { + t.Errorf("ParseIgnoredXids(%q) len = %d, want %d", tt.input, len(result), len(tt.expected)) + return + } + + for i, v := range result { + if v != tt.expected[i] { + t.Errorf("ParseIgnoredXids(%q)[%d] = %d, want %d", tt.input, i, v, tt.expected[i]) + } + } + }) + } +} + +func TestGetXidSeverity(t *testing.T) { + tests := []struct { + xid uint64 + expected XidSeverity + }{ + // Ignored (application errors) + {13, XidSeverityIgnored}, + {31, XidSeverityIgnored}, + {43, XidSeverityIgnored}, + {45, XidSeverityIgnored}, + {68, XidSeverityIgnored}, + {109, XidSeverityIgnored}, + + // Critical (hardware failures) + {48, XidSeverityCritical}, + {63, XidSeverityCritical}, + {64, XidSeverityCritical}, + {74, XidSeverityCritical}, + {79, XidSeverityCritical}, + {94, XidSeverityCritical}, + {95, XidSeverityCritical}, + {119, XidSeverityCritical}, + {120, XidSeverityCritical}, + + // Warning (unknown XIDs) + {1, XidSeverityWarning}, + {2, XidSeverityWarning}, + {999, XidSeverityWarning}, + + // Boundary values + {0, XidSeverityWarning}, + {math.MaxUint64, XidSeverityWarning}, + } + + for _, tt := range tests { + result := GetXidSeverity(tt.xid) + if result != tt.expected { + t.Errorf("GetXidSeverity(%d) = %v, want %v", tt.xid, result, tt.expected) + } + } +} + +func TestXidSeverity_String(t *testing.T) { + tests := []struct { + severity XidSeverity + expected string + }{ + {XidSeverityUnknown, "unknown"}, + {XidSeverityIgnored, "ignored"}, + {XidSeverityWarning, "warning"}, + {XidSeverityCritical, "critical"}, + {XidSeverity(99), "unknown"}, // Invalid severity + } + + for _, tt := range tests { + result := tt.severity.String() + if result != tt.expected { + t.Errorf("XidSeverity(%d).String() = %q, want %q", tt.severity, result, tt.expected) + } + } +} diff --git a/pkg/services/device/v1alpha1/gpu_provider.go b/pkg/services/device/v1alpha1/gpu_provider.go index 32dc779bd..7f11c98e7 100644 --- a/pkg/services/device/v1alpha1/gpu_provider.go +++ 
b/pkg/services/device/v1alpha1/gpu_provider.go @@ -1,81 +1,76 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Code generated by service-gen. DO NOT EDIT. +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package v1alpha1 import ( "fmt" - "path" devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1" pb "github.com/nvidia/nvsentinel/internal/generated/device/v1alpha1" "github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/api" "github.com/nvidia/nvsentinel/pkg/controlplane/apiserver/registry" + "github.com/nvidia/nvsentinel/pkg/storage/memory" "google.golang.org/grpc" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/runtime/serializer" "k8s.io/apiserver/pkg/storage/storagebackend" - "k8s.io/apiserver/pkg/storage/storagebackend/factory" ) func init() { registry.Register(NewGPUServiceProvider()) } -type gpuServiceProvider struct{ +type gpuServiceProvider struct { groupVersion schema.GroupVersion } +// NewGPUServiceProvider returns a ServiceProvider that installs the GPU gRPC service. func NewGPUServiceProvider() api.ServiceProvider { return &gpuServiceProvider{ groupVersion: devicev1alpha1.SchemeGroupVersion, } } -func (p *gpuServiceProvider) Install(svr *grpc.Server, storageConfig storagebackend.Config) (api.Service, error) { +// Install creates the in-memory storage backend and registers the GPU service +// on the provided gRPC server. +func (p *gpuServiceProvider) Install(svr *grpc.Server, cfg storagebackend.Config) (api.Service, error) { + // Currently only in-memory storage is supported. The cfg parameter is + // accepted for future extensibility but not used for backend selection. 
+ _ = cfg + gv := p.groupVersion.String() scheme := runtime.NewScheme() if err := devicev1alpha1.AddToScheme(scheme); err != nil { return nil, fmt.Errorf("failed to add %q to scheme: %w", gv, err) } - - codecs := serializer.NewCodecFactory(scheme) - codec := codecs.LegacyCodec(p.groupVersion) - configForResource := storagebackend.ConfigForResource{ - Config: storageConfig, + codecs := serializer.NewCodecFactory(scheme) + info, ok := runtime.SerializerInfoForMediaType(codecs.SupportedMediaTypes(), runtime.ContentTypeJSON) + if !ok { + return nil, fmt.Errorf("no serializer found for %s in %s", runtime.ContentTypeJSON, gv) } - configForResource.Config.Codec = codec - - resourcePrefix := path.Join("/", p.groupVersion.Group, "gpus") + codec := codecs.CodecForVersions(info.Serializer, info.Serializer, schema.GroupVersions{p.groupVersion}, schema.GroupVersions{p.groupVersion}) - s, destroyFunc, err := factory.Create( - configForResource, - func() runtime.Object { return &devicev1alpha1.GPU{} }, - func() runtime.Object { return &devicev1alpha1.GPUList{} }, - resourcePrefix, - ) + s, destroyFunc, err := memory.CreateStorage(codec) if err != nil { - return nil, fmt.Errorf("failed to create storage for %s: %w", resourcePrefix, err) + return nil, fmt.Errorf("failed to create in-memory storage for %s: %w", gv, err) } service := NewGPUService(s, destroyFunc) - pb.RegisterGpuServiceServer(svr, service) return service, nil diff --git a/pkg/services/device/v1alpha1/gpu_service.go b/pkg/services/device/v1alpha1/gpu_service.go index 3bff930d5..f4434ef6d 100644 --- a/pkg/services/device/v1alpha1/gpu_service.go +++ b/pkg/services/device/v1alpha1/gpu_service.go @@ -1,18 +1,16 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Code generated by service-gen. DO NOT EDIT. +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. package v1alpha1 @@ -21,9 +19,11 @@ import ( "fmt" "path" "reflect" + "regexp" devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1" pb "github.com/nvidia/nvsentinel/internal/generated/device/v1alpha1" + "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" "google.golang.org/protobuf/types/known/emptypb" @@ -38,21 +38,24 @@ import ( type gpuService struct { pb.UnimplementedGpuServiceServer - storage storage.Interface + storage storage.Interface destroyFunc factory.DestroyFunc } +// NewGPUService creates a new GPU gRPC service backed by the provided storage. func NewGPUService(storage storage.Interface, destroyFunc factory.DestroyFunc) *gpuService { return &gpuService{ - storage: storage, + storage: storage, destroyFunc: destroyFunc, } } +// Name returns the fully qualified gRPC service name. 
func (s *gpuService) Name() string {
	return pb.GpuService_ServiceDesc.ServiceName
}

+// IsReady reports whether the underlying storage backend is healthy.
func (s *gpuService) IsReady() bool {
	if s.storage == nil {
		return false
@@ -60,6 +63,7 @@
	return s.storage.ReadinessCheck() == nil
}

+// Cleanup shuts down the storage backend.
func (s *gpuService) Cleanup() {
	if s.destroyFunc != nil {
		klog.V(2).InfoS("Shutting down storage backend", "service", s.Name())
@@ -67,22 +71,67 @@
	}
}

-func (s *gpuService) storageKey(ns string, name string) string {
-	base := path.Join("/", devicev1alpha1.SchemeGroupVersion.Group, "gpus")
+// normalizeNamespace returns "default" if ns is empty.
+func normalizeNamespace(ns string) string {
+	if ns == "" {
+		return "default"
+	}
+	return ns
+}
+
+// validateNamespace checks that ns does not exceed the K8s maximum namespace length.
+// An empty namespace is valid (it defaults to "default" elsewhere).
+func validateNamespace(ns string) error {
+	if ns == "" {
+		return nil
+	}
+	if len(ns) > 253 { // NOTE(review): 253 is the DNS-subdomain (resource name) limit; Kubernetes namespace names are DNS-1123 labels capped at 63 chars — confirm the intended bound
+		return status.Error(codes.InvalidArgument, "namespace exceeds maximum length of 253 characters")
+	}
+	return nil
+}
+
+// gpuUUIDPattern matches NVIDIA GPU UUIDs
+// (e.g., GPU-12345678-1234-1234-1234-123456789abc).
+var gpuUUIDPattern = regexp.MustCompile(
+	`^GPU-[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-` +
+		`[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$`,
+)
+
+// validateGPUName checks that name is non-empty and matches
+// the NVIDIA GPU UUID format.
+func validateGPUName(name string) error { + if name == "" { + return status.Error(codes.InvalidArgument, "name is required") + } - if ns == "" && name != "" { - ns = "default" + if !gpuUUIDPattern.MatchString(name) { + return status.Errorf(codes.InvalidArgument, + "name must be a valid GPU UUID "+ + "(GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx), got %q", + name) } - // Pattern: /device.nvidia.com/gpus// + return nil +} + +func (s *gpuService) storageKey(ns string, name string) string { + base := path.Join("/", devicev1alpha1.SchemeGroupVersion.Group, "gpus") + if name != "" { + ns = normalizeNamespace(ns) + } return path.Join(base, ns, name) } +// GetGpu retrieves a single GPU resource. func (s *gpuService) GetGpu(ctx context.Context, req *pb.GetGpuRequest) (*pb.GetGpuResponse, error) { logger := klog.FromContext(ctx) - if req.GetName() == "" { - return nil, status.Error(codes.InvalidArgument, "name is required") + if err := validateGPUName(req.GetName()); err != nil { + return nil, err + } + if err := validateNamespace(req.GetNamespace()); err != nil { + return nil, err } key := s.storageKey(req.GetNamespace(), req.GetName()) @@ -106,15 +155,20 @@ func (s *gpuService) GetGpu(ctx context.Context, req *pb.GetGpuRequest) (*pb.Get }, nil } +// ListGpus retrieves a list of GPU resources. 
func (s *gpuService) ListGpus(ctx context.Context, req *pb.ListGpusRequest) (*pb.ListGpusResponse, error) { logger := klog.FromContext(ctx) + if err := validateNamespace(req.GetNamespace()); err != nil { + return nil, err + } + var gpus devicev1alpha1.GPUList opts := storage.ListOptions{ ResourceVersion: req.GetOpts().GetResourceVersion(), Recursive: true, - Predicate: storage.Everything, // TODO: selection predicate + Predicate: storage.Everything, } key := s.storageKey(req.GetNamespace(), "") @@ -125,7 +179,6 @@ func (s *gpuService) ListGpus(ctx context.Context, req *pb.ListGpusRequest) (*pb if rv == 0 { rvStr = req.GetOpts().GetResourceVersion() } - return &pb.ListGpusResponse{ GpuList: &pb.GpuList{ Metadata: &pb.ListMeta{ @@ -150,7 +203,8 @@ func (s *gpuService) ListGpus(ctx context.Context, req *pb.ListGpusRequest) (*pb }, nil } -func (s *gpuService) WatchGpus(req *pb.WatchGpusRequest, stream pb.GpuService_WatchGpusServer) error { +// WatchGpus streams lifecycle events for GPU resources. +func (s *gpuService) WatchGpus(req *pb.WatchGpusRequest, stream grpc.ServerStreamingServer[pb.WatchGpusResponse]) error { ctx := stream.Context() logger := klog.FromContext(ctx) @@ -159,9 +213,9 @@ func (s *gpuService) WatchGpus(req *pb.WatchGpusRequest, stream pb.GpuService_Wa key := s.storageKey(req.GetNamespace(), "") w, err := s.storage.Watch(ctx, key, storage.ListOptions{ - ResourceVersion: req.GetOpts().GetResourceVersion(), + ResourceVersion: rv, Recursive: true, - Predicate: storage.Everything, // TODO: selection predicate + Predicate: storage.Everything, }) if err != nil { if storage.IsInvalidError(err) { @@ -226,25 +280,25 @@ func (s *gpuService) WatchGpus(req *pb.WatchGpusRequest, stream pb.GpuService_Wa } } +// CreateGpu creates a single GPU resource. 
func (s *gpuService) CreateGpu(ctx context.Context, req *pb.CreateGpuRequest) (*pb.Gpu, error) { logger := klog.FromContext(ctx) if req.GetGpu() == nil { return nil, status.Error(codes.InvalidArgument, "resource body is required") } - if req.GetGpu().GetMetadata() == nil || req.GetGpu().GetMetadata().GetName() == "" { + if req.GetGpu().GetMetadata() == nil { return nil, status.Error(codes.InvalidArgument, "metadata.name: Required value") } + if err := validateGPUName(req.GetGpu().GetMetadata().GetName()); err != nil { + return nil, err + } name := req.GetGpu().GetMetadata().GetName() - ns := req.GetGpu().GetMetadata().GetNamespace() - if ns == "" { - ns = "default" - } + ns := normalizeNamespace(req.GetGpu().GetMetadata().GetNamespace()) key := s.storageKey(ns, name) gpu := devicev1alpha1.FromProto(req.Gpu) - // TODO: move into PrepareForCreate function? gpu.SetNamespace(ns) gpu.SetUID(uuid.NewUUID()) now := metav1.Now() @@ -270,15 +324,19 @@ func (s *gpuService) CreateGpu(ctx context.Context, req *pb.CreateGpuRequest) (* return devicev1alpha1.ToProto(out), nil } +// UpdateGpu updates a single GPU resource (spec only). 
func (s *gpuService) UpdateGpu(ctx context.Context, req *pb.UpdateGpuRequest) (*pb.Gpu, error) { logger := klog.FromContext(ctx) if req.GetGpu() == nil { return nil, status.Error(codes.InvalidArgument, "resource body is required") } - if req.GetGpu().GetMetadata() == nil || req.GetGpu().GetMetadata().GetName() == "" { + if req.GetGpu().GetMetadata() == nil { return nil, status.Error(codes.InvalidArgument, "metadata.name: Required value") } + if err := validateGPUName(req.GetGpu().GetMetadata().GetName()); err != nil { + return nil, err + } name := req.GetGpu().GetMetadata().GetName() ns := req.GetGpu().GetMetadata().GetNamespace() @@ -289,8 +347,8 @@ func (s *gpuService) UpdateGpu(ctx context.Context, req *pb.UpdateGpuRequest) (* ctx, key, updatedGpu, - false, // ignoreNotFound - nil, // TODO: preconditions + false, + nil, func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) { curr := input.(*devicev1alpha1.GPU) incoming := devicev1alpha1.FromProto(req.GetGpu()) @@ -319,7 +377,7 @@ func (s *gpuService) UpdateGpu(ctx context.Context, req *pb.UpdateGpuRequest) (* return clone, nil, nil }, - nil, // TODO: cachedExistingObject + nil, ) if err != nil { @@ -345,11 +403,76 @@ func (s *gpuService) UpdateGpu(ctx context.Context, req *pb.UpdateGpuRequest) (* return devicev1alpha1.ToProto(updatedGpu), nil } +// UpdateGpuStatus updates only the status subresource of a GPU. 
+func (s *gpuService) UpdateGpuStatus(ctx context.Context, req *pb.UpdateGpuStatusRequest) (*pb.Gpu, error) { + logger := klog.FromContext(ctx) + + if req.GetGpu() == nil { + return nil, status.Error(codes.InvalidArgument, "resource body is required") + } + if req.GetGpu().GetMetadata() == nil { + return nil, status.Error(codes.InvalidArgument, "metadata.name: Required value") + } + if err := validateGPUName(req.GetGpu().GetMetadata().GetName()); err != nil { + return nil, err + } + if req.GetGpu().GetStatus() == nil { + return nil, status.Error(codes.InvalidArgument, "status is required") + } + + name := req.GetGpu().GetMetadata().GetName() + ns := req.GetGpu().GetMetadata().GetNamespace() + key := s.storageKey(ns, name) + updatedGpu := &devicev1alpha1.GPU{} + + err := s.storage.GuaranteedUpdate( + ctx, + key, + updatedGpu, + false, + nil, + func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) { + curr := input.(*devicev1alpha1.GPU) + incoming := devicev1alpha1.FromProto(req.GetGpu()) + + if incoming.ResourceVersion != "" && incoming.ResourceVersion != curr.ResourceVersion { + return nil, nil, storage.NewResourceVersionConflictsError(key, 0) + } + + clone := curr.DeepCopy() + clone.Status = incoming.Status + + return clone, nil, nil + }, + nil, + ) + + if err != nil { + if storage.IsNotFound(err) { + return nil, status.Errorf(codes.NotFound, "GPU %q not found", name) + } + if storage.IsConflict(err) { + return nil, status.Errorf(codes.Aborted, + "operation cannot be fulfilled on GPUs %q: the object has been modified", name) + } + logger.Error(err, "failed to update GPU status", "name", name, "namespace", ns) + return nil, status.Error(codes.Internal, "internal server error") + } + + logger.V(2).Info("Successfully updated GPU status", "name", name, "namespace", ns, "resourceVersion", updatedGpu.ResourceVersion) + + return devicev1alpha1.ToProto(updatedGpu), nil +} + +// DeleteGpu deletes a single GPU resource. 
func (s *gpuService) DeleteGpu(ctx context.Context, req *pb.DeleteGpuRequest) (*emptypb.Empty, error) { logger := klog.FromContext(ctx) - if req.GetName() == "" { - return nil, status.Error(codes.InvalidArgument, "name is required") + if err := validateGPUName(req.GetName()); err != nil { + return nil, err + } + if err := validateNamespace(req.GetNamespace()); err != nil { + return nil, err } name := req.GetName() @@ -361,10 +484,10 @@ func (s *gpuService) DeleteGpu(ctx context.Context, req *pb.DeleteGpuRequest) (* ctx, key, out, - nil, // TODO: preconditions (e.g., rv check) + nil, storage.ValidateAllObjectFunc, - nil, // TODO: cachedExistingObject - storage.DeleteOptions{}, // TODO: DeleteOptions + nil, + storage.DeleteOptions{}, ); err != nil { if storage.IsNotFound(err) { return nil, status.Errorf(codes.NotFound, "GPU %q not found", name) diff --git a/pkg/services/device/v1alpha1/gpu_service_test.go b/pkg/services/device/v1alpha1/gpu_service_test.go new file mode 100644 index 000000000..184869c97 --- /dev/null +++ b/pkg/services/device/v1alpha1/gpu_service_test.go @@ -0,0 +1,494 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package v1alpha1 + +import ( + "context" + "strings" + "testing" + + devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1" + pb "github.com/nvidia/nvsentinel/internal/generated/device/v1alpha1" + "github.com/nvidia/nvsentinel/pkg/storage/memory" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/runtime/serializer" +) + +func newTestService(t *testing.T) *gpuService { + t.Helper() + + scheme := runtime.NewScheme() + if err := devicev1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + codecs := serializer.NewCodecFactory(scheme) + gv := devicev1alpha1.SchemeGroupVersion + info, _ := runtime.SerializerInfoForMediaType(codecs.SupportedMediaTypes(), runtime.ContentTypeJSON) + codec := codecs.CodecForVersions(info.Serializer, info.Serializer, schema.GroupVersions{gv}, schema.GroupVersions{gv}) + + s, destroy, err := memory.CreateStorage(codec) + if err != nil { + t.Fatal(err) + } + t.Cleanup(destroy) + + return NewGPUService(s, destroy) +} + +func createTestGpu(t *testing.T, svc *gpuService, name string) *pb.Gpu { + t.Helper() + + gpu, err := svc.CreateGpu(context.Background(), &pb.CreateGpuRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: name, + Namespace: "default", + }, + Spec: &pb.GpuSpec{ + Uuid: name, + }, + }, + }) + if err != nil { + t.Fatalf("failed to create GPU %q: %v", name, err) + } + + return gpu +} + +func TestGPUService_CreateAndGet(t *testing.T) { + svc := newTestService(t) + ctx := context.Background() + + const gpuName = "GPU-00000000-0000-0000-0000-000000000000" + created := createTestGpu(t, svc, gpuName) + + if created.GetMetadata().GetName() != gpuName { + t.Errorf("expected name %q, got %q", gpuName, created.GetMetadata().GetName()) + } + if created.GetMetadata().GetUid() == "" { + t.Error("expected UID to be set on created GPU") + } + + resp, err := svc.GetGpu(ctx, &pb.GetGpuRequest{ + 
Name: gpuName, + Namespace: "default", + }) + if err != nil { + t.Fatalf("GetGpu failed: %v", err) + } + + got := resp.GetGpu() + if got.GetMetadata().GetName() != gpuName { + t.Errorf("expected name %q, got %q", gpuName, got.GetMetadata().GetName()) + } + if got.GetMetadata().GetUid() != created.GetMetadata().GetUid() { + t.Errorf("UID mismatch: expected %q, got %q", + created.GetMetadata().GetUid(), got.GetMetadata().GetUid()) + } +} + +func TestGPUService_CreateDuplicate(t *testing.T) { + svc := newTestService(t) + + const gpuName = "GPU-11111111-1111-1111-1111-111111111111" + createTestGpu(t, svc, gpuName) + + _, err := svc.CreateGpu(context.Background(), &pb.CreateGpuRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: gpuName, + Namespace: "default", + }, + Spec: &pb.GpuSpec{ + Uuid: gpuName, + }, + }, + }) + if err == nil { + t.Fatal("expected error for duplicate create, got nil") + } + + st, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got %T: %v", err, err) + } + if st.Code() != codes.AlreadyExists { + t.Errorf("expected code %v, got %v: %s", codes.AlreadyExists, st.Code(), st.Message()) + } +} + +func TestGPUService_List(t *testing.T) { + svc := newTestService(t) + ctx := context.Background() + + createTestGpu(t, svc, "GPU-aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa") + createTestGpu(t, svc, "GPU-bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb") + + resp, err := svc.ListGpus(ctx, &pb.ListGpusRequest{ + Namespace: "default", + }) + if err != nil { + t.Fatalf("ListGpus failed: %v", err) + } + + count := len(resp.GetGpuList().GetItems()) + if count != 2 { + t.Errorf("expected 2 GPUs, got %d", count) + } +} + +func TestGPUService_Delete(t *testing.T) { + svc := newTestService(t) + ctx := context.Background() + + const gpuName = "GPU-22222222-2222-2222-2222-222222222222" + createTestGpu(t, svc, gpuName) + + _, err := svc.DeleteGpu(ctx, &pb.DeleteGpuRequest{ + Name: gpuName, + Namespace: "default", + }) + if err != nil { + 
t.Fatalf("DeleteGpu failed: %v", err) + } + + _, err = svc.GetGpu(ctx, &pb.GetGpuRequest{ + Name: gpuName, + Namespace: "default", + }) + if err == nil { + t.Fatal("expected NotFound after delete, got nil") + } + + st, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got %T: %v", err, err) + } + if st.Code() != codes.NotFound { + t.Errorf("expected code %v, got %v: %s", codes.NotFound, st.Code(), st.Message()) + } +} + +func TestGPUService_DeleteNotFound(t *testing.T) { + svc := newTestService(t) + + _, err := svc.DeleteGpu(context.Background(), &pb.DeleteGpuRequest{ + Name: "GPU-ffffffff-ffff-ffff-ffff-ffffffffffff", + Namespace: "default", + }) + if err == nil { + t.Fatal("expected NotFound error, got nil") + } + + st, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got %T: %v", err, err) + } + if st.Code() != codes.NotFound { + t.Errorf("expected code %v, got %v: %s", codes.NotFound, st.Code(), st.Message()) + } +} + +func TestGPUService_Update(t *testing.T) { + svc := newTestService(t) + ctx := context.Background() + + const gpuName = "GPU-33333333-3333-3333-3333-333333333333" + created := createTestGpu(t, svc, gpuName) + + updated, err := svc.UpdateGpu(ctx, &pb.UpdateGpuRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: gpuName, + Namespace: "default", + }, + Spec: &pb.GpuSpec{ + Uuid: "GPU-new-uuid", + }, + }, + }) + if err != nil { + t.Fatalf("UpdateGpu failed: %v", err) + } + + if updated.GetSpec().GetUuid() != "GPU-new-uuid" { + t.Errorf("expected spec.uuid %q, got %q", "GPU-new-uuid", updated.GetSpec().GetUuid()) + } + if updated.GetMetadata().GetGeneration() != created.GetMetadata().GetGeneration()+1 { + t.Errorf("expected generation %d, got %d", + created.GetMetadata().GetGeneration()+1, updated.GetMetadata().GetGeneration()) + } +} + +func TestGPUService_UpdateStatus(t *testing.T) { + svc := newTestService(t) + ctx := context.Background() + + const gpuName = 
"GPU-44444444-4444-4444-4444-444444444444" + created := createTestGpu(t, svc, gpuName) + + updated, err := svc.UpdateGpuStatus(ctx, &pb.UpdateGpuStatusRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: gpuName, + Namespace: "default", + }, + Status: &pb.GpuStatus{ + RecommendedAction: "drain", + }, + }, + }) + if err != nil { + t.Fatalf("UpdateGpuStatus failed: %v", err) + } + + if updated.GetStatus().GetRecommendedAction() != "drain" { + t.Errorf("expected recommended action %q, got %q", + "drain", updated.GetStatus().GetRecommendedAction()) + } + + // Generation must NOT change on status-only updates. + if updated.GetMetadata().GetGeneration() != created.GetMetadata().GetGeneration() { + t.Errorf("expected generation %d (unchanged), got %d", + created.GetMetadata().GetGeneration(), updated.GetMetadata().GetGeneration()) + } +} + +func TestGPUService_UpdateStatus_StaleResourceVersion(t *testing.T) { + svc := newTestService(t) + ctx := context.Background() + + const gpuName = "GPU-55555555-5555-5555-5555-555555555555" + created := createTestGpu(t, svc, gpuName) + staleRV := created.GetMetadata().GetResourceVersion() + + // Update spec to increment the resource version. + _, err := svc.UpdateGpu(ctx, &pb.UpdateGpuRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: gpuName, + Namespace: "default", + }, + Spec: &pb.GpuSpec{ + Uuid: "GPU-updated-uuid", + }, + }, + }) + if err != nil { + t.Fatalf("UpdateGpu failed: %v", err) + } + + // Attempt status update with the stale resource version. 
+ _, err = svc.UpdateGpuStatus(ctx, &pb.UpdateGpuStatusRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: gpuName, + Namespace: "default", + ResourceVersion: staleRV, + }, + Status: &pb.GpuStatus{ + RecommendedAction: "drain", + }, + }, + }) + if err == nil { + t.Fatal("expected error for stale resource version, got nil") + } + + st, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got %T: %v", err, err) + } + if st.Code() != codes.Aborted { + t.Errorf("expected code %v, got %v: %s", codes.Aborted, st.Code(), st.Message()) + } +} + +func TestGPUService_UpdateStatus_NilStatus(t *testing.T) { + svc := newTestService(t) + ctx := context.Background() + + const gpuName = "GPU-66666666-6666-6666-6666-666666666666" + createTestGpu(t, svc, gpuName) + + _, err := svc.UpdateGpuStatus(ctx, &pb.UpdateGpuStatusRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: gpuName, + Namespace: "default", + }, + Status: nil, + }, + }) + if err == nil { + t.Fatal("expected error for nil status, got nil") + } + + st, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got %T: %v", err, err) + } + if st.Code() != codes.InvalidArgument { + t.Errorf("expected code %v, got %v: %s", codes.InvalidArgument, st.Code(), st.Message()) + } +} + +func TestGPUService_UpdateStatus_EmptyConditions(t *testing.T) { + svc := newTestService(t) + ctx := context.Background() + + const gpuName = "GPU-77777777-7777-7777-7777-777777777777" + createTestGpu(t, svc, gpuName) + + // First set a condition. 
+ _, err := svc.UpdateGpuStatus(ctx, &pb.UpdateGpuStatusRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: gpuName, + Namespace: "default", + }, + Status: &pb.GpuStatus{ + Conditions: []*pb.Condition{ + { + Type: "Ready", + Status: "True", + Reason: "TestReason", + }, + }, + RecommendedAction: "drain", + }, + }, + }) + if err != nil { + t.Fatalf("UpdateGpuStatus (set condition) failed: %v", err) + } + + // Now update with empty conditions to clear them. + updated, err := svc.UpdateGpuStatus(ctx, &pb.UpdateGpuStatusRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: gpuName, + Namespace: "default", + }, + Status: &pb.GpuStatus{ + Conditions: []*pb.Condition{}, + RecommendedAction: "none", + }, + }, + }) + if err != nil { + t.Fatalf("UpdateGpuStatus (clear conditions) failed: %v", err) + } + + if len(updated.GetStatus().GetConditions()) != 0 { + t.Errorf("expected 0 conditions after clearing, got %d", len(updated.GetStatus().GetConditions())) + } + if updated.GetStatus().GetRecommendedAction() != "none" { + t.Errorf("expected recommended action %q, got %q", "none", updated.GetStatus().GetRecommendedAction()) + } +} + +func TestGPUService_CreateValidation(t *testing.T) { + svc := newTestService(t) + + tests := []struct { + name string + req *pb.CreateGpuRequest + }{ + { + name: "nil gpu body", + req: &pb.CreateGpuRequest{}, + }, + { + name: "nil metadata", + req: &pb.CreateGpuRequest{ + Gpu: &pb.Gpu{ + Spec: &pb.GpuSpec{Uuid: "GPU-test"}, + }, + }, + }, + { + name: "empty name", + req: &pb.CreateGpuRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{Name: ""}, + Spec: &pb.GpuSpec{Uuid: "GPU-test"}, + }, + }, + }, + { + name: "invalid GPU UUID format", + req: &pb.CreateGpuRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{Name: "not-a-gpu-uuid"}, + Spec: &pb.GpuSpec{Uuid: "GPU-test"}, + }, + }, + }, + { + name: "path traversal in name", + req: &pb.CreateGpuRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{Name: "../../etc/passwd"}, + Spec: 
&pb.GpuSpec{Uuid: "GPU-test"}, + }, + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + _, err := svc.CreateGpu(context.Background(), tc.req) + if err == nil { + t.Fatal("expected InvalidArgument error, got nil") + } + + st, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got %T: %v", err, err) + } + if st.Code() != codes.InvalidArgument { + t.Errorf("expected code %v, got %v: %s", codes.InvalidArgument, st.Code(), st.Message()) + } + }) + } +} + +func TestGPUService_NamespaceValidation(t *testing.T) { + svc := newTestService(t) + ctx := context.Background() + + longNS := strings.Repeat("a", 254) + + _, err := svc.GetGpu(ctx, &pb.GetGpuRequest{ + Name: "GPU-00000000-0000-0000-0000-000000000000", + Namespace: longNS, + }) + if err == nil { + t.Fatal("expected InvalidArgument for long namespace, got nil") + } + st, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got %T: %v", err, err) + } + if st.Code() != codes.InvalidArgument { + t.Errorf("expected code %v, got %v: %s", codes.InvalidArgument, st.Code(), st.Message()) + } +} diff --git a/pkg/services/device/v1alpha1/integration_test.go b/pkg/services/device/v1alpha1/integration_test.go new file mode 100644 index 000000000..f84344575 --- /dev/null +++ b/pkg/services/device/v1alpha1/integration_test.go @@ -0,0 +1,408 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package v1alpha1_test + +import ( + "io" + "testing" + "time" + + pb "github.com/nvidia/nvsentinel/internal/generated/device/v1alpha1" + "github.com/nvidia/nvsentinel/pkg/testutil" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + "google.golang.org/protobuf/types/known/timestamppb" +) + +// TestIntegration_CRUD performs a full Create→Get→List→Update→Delete cycle over gRPC. +func TestIntegration_CRUD(t *testing.T) { + client := testutil.NewTestGPUClient(t) + ctx := t.Context() + + const gpuName = "GPU-12345678-1234-1234-1234-123456789abc" + + // Create a GPU + created, err := client.CreateGpu(ctx, &pb.CreateGpuRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: gpuName, + Namespace: "default", + }, + Spec: &pb.GpuSpec{ + Uuid: "GPU-1234", + }, + }, + }) + if err != nil { + t.Fatalf("CreateGpu failed: %v", err) + } + + if created.GetMetadata().GetName() != gpuName { + t.Errorf("expected name %q, got %q", gpuName, created.GetMetadata().GetName()) + } + if created.GetSpec().GetUuid() != "GPU-1234" { + t.Errorf("expected UUID %q, got %q", "GPU-1234", created.GetSpec().GetUuid()) + } + if created.GetMetadata().GetUid() == "" { + t.Error("expected UID to be set") + } + + // Get it back + getResp, err := client.GetGpu(ctx, &pb.GetGpuRequest{ + Name: gpuName, + Namespace: "default", + }) + if err != nil { + t.Fatalf("GetGpu failed: %v", err) + } + + got := getResp.GetGpu() + if got.GetSpec().GetUuid() != "GPU-1234" { + t.Errorf("expected UUID %q, got %q", "GPU-1234", got.GetSpec().GetUuid()) + } + + // List namespace "default" + listResp, err := client.ListGpus(ctx, &pb.ListGpusRequest{ + Namespace: "default", + }) + if err != nil { + t.Fatalf("ListGpus failed: %v", err) + } + + if len(listResp.GetGpuList().GetItems()) != 1 { + t.Errorf("expected 1 GPU, got %d", len(listResp.GetGpuList().GetItems())) + } + + // Update the spec (change 
UUID to "GPU-5678") + got.Spec.Uuid = "GPU-5678" + updated, err := client.UpdateGpu(ctx, &pb.UpdateGpuRequest{ + Gpu: got, + }) + if err != nil { + t.Fatalf("UpdateGpu failed: %v", err) + } + + if updated.GetSpec().GetUuid() != "GPU-5678" { + t.Errorf("expected UUID %q, got %q", "GPU-5678", updated.GetSpec().GetUuid()) + } + + // Verify change persists + getResp2, err := client.GetGpu(ctx, &pb.GetGpuRequest{ + Name: gpuName, + Namespace: "default", + }) + if err != nil { + t.Fatalf("GetGpu (after update) failed: %v", err) + } + + if getResp2.GetGpu().GetSpec().GetUuid() != "GPU-5678" { + t.Errorf("expected UUID %q after update, got %q", "GPU-5678", getResp2.GetGpu().GetSpec().GetUuid()) + } + + // Delete it + _, err = client.DeleteGpu(ctx, &pb.DeleteGpuRequest{ + Name: gpuName, + Namespace: "default", + }) + if err != nil { + t.Fatalf("DeleteGpu failed: %v", err) + } + + // List again, verify count=0 + listResp2, err := client.ListGpus(ctx, &pb.ListGpusRequest{ + Namespace: "default", + }) + if err != nil { + t.Fatalf("ListGpus (after delete) failed: %v", err) + } + + if len(listResp2.GetGpuList().GetItems()) != 0 { + t.Errorf("expected 0 GPUs after delete, got %d", len(listResp2.GetGpuList().GetItems())) + } +} + +// TestIntegration_Watch tests the streaming WatchGpus RPC. +func TestIntegration_Watch(t *testing.T) { + client := testutil.NewTestGPUClient(t) + ctx := t.Context() + + const gpuName = "GPU-aabbccdd-1122-3344-5566-778899aabbcc" + + // Start a watch stream + stream, err := client.WatchGpus(ctx, &pb.WatchGpusRequest{ + Namespace: "default", + }) + if err != nil { + t.Fatalf("WatchGpus failed to start: %v", err) + } + + // Create a GPU in a separate goroutine after a brief delay. + // The WatchGpus RPC returns a stream only after the server-side watch + // is established. However, the gRPC client dial and server handler setup + // may not be fully synchronized, so a small delay ensures the watch is + // ready to receive events. 
The main goroutine uses a 5s timeout on Recv + // as the real synchronization mechanism. + doneCh := make(chan struct{}) + go func() { + defer close(doneCh) + time.Sleep(100 * time.Millisecond) + _, err := client.CreateGpu(ctx, &pb.CreateGpuRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: gpuName, + Namespace: "default", + }, + Spec: &pb.GpuSpec{ + Uuid: "GPU-WATCH-1", + }, + }, + }) + if err != nil { + t.Errorf("CreateGpu in watch test failed: %v", err) + } + }() + + // Wait for the ADDED event + timeout := time.After(5 * time.Second) + receivedEvent := false + + for !receivedEvent { + select { + case <-timeout: + t.Fatal("timeout waiting for watch event") + default: + event, err := stream.Recv() + if err == io.EOF { + t.Fatal("stream closed before receiving event") + } + if err != nil { + t.Fatalf("stream.Recv() failed: %v", err) + } + + if event.GetType() == "ADDED" && event.GetObject().GetMetadata().GetName() == gpuName { + receivedEvent = true + if event.GetObject().GetSpec().GetUuid() != "GPU-WATCH-1" { + t.Errorf("expected UUID %q, got %q", "GPU-WATCH-1", event.GetObject().GetSpec().GetUuid()) + } + } + } + } + + // Wait for the create goroutine to finish + <-doneCh + + // Clean up + _, err = client.DeleteGpu(ctx, &pb.DeleteGpuRequest{ + Name: gpuName, + Namespace: "default", + }) + if err != nil { + t.Errorf("cleanup DeleteGpu failed: %v", err) + } +} + +// TestIntegration_WatchWithResourceVersion_OutOfRange verifies that requesting +// a watch from a specific ResourceVersion returns codes.OutOfRange, because the +// in-memory store does not support watch resume. 
+func TestIntegration_WatchWithResourceVersion_OutOfRange(t *testing.T) { + client := testutil.NewTestGPUClient(t) + ctx := t.Context() + + stream, err := client.WatchGpus(ctx, &pb.WatchGpusRequest{ + Namespace: "default", + Opts: &pb.ListOptions{ + ResourceVersion: "1", + }, + }) + if err != nil { + t.Fatalf("WatchGpus failed to open stream: %v", err) + } + + // In gRPC server streaming, handler errors surface on Recv. + _, err = stream.Recv() + if err == nil { + t.Fatal("expected OutOfRange error for non-empty ResourceVersion, got nil") + } + if status.Code(err) != codes.OutOfRange { + t.Errorf("expected codes.OutOfRange, got %v: %v", status.Code(err), err) + } +} + +// TestIntegration_UpdateStatus tests the status subresource update. +func TestIntegration_UpdateStatus(t *testing.T) { + client := testutil.NewTestGPUClient(t) + ctx := t.Context() + + const gpuName = "GPU-55667788-aabb-ccdd-eeff-001122334455" + + // Create a GPU + created, err := client.CreateGpu(ctx, &pb.CreateGpuRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: gpuName, + Namespace: "default", + }, + Spec: &pb.GpuSpec{ + Uuid: "GPU-STATUS-1", + }, + }, + }) + if err != nil { + t.Fatalf("CreateGpu failed: %v", err) + } + + // Update the status with a condition + updatedGpu, err := client.UpdateGpuStatus(ctx, &pb.UpdateGpuStatusRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: gpuName, + Namespace: "default", + ResourceVersion: created.GetMetadata().GetResourceVersion(), + }, + Status: &pb.GpuStatus{ + Conditions: []*pb.Condition{ + { + Type: "Ready", + Status: "True", + LastTransitionTime: timestamppb.Now(), + Reason: "TestReason", + Message: "Test message", + }, + }, + RecommendedAction: "No action needed", + }, + }, + }) + if err != nil { + t.Fatalf("UpdateGpuStatus failed: %v", err) + } + + if len(updatedGpu.GetStatus().GetConditions()) != 1 { + t.Errorf("expected 1 condition, got %d", len(updatedGpu.GetStatus().GetConditions())) + } + + // Get the GPU and verify status 
was updated + getResp, err := client.GetGpu(ctx, &pb.GetGpuRequest{ + Name: gpuName, + Namespace: "default", + }) + if err != nil { + t.Fatalf("GetGpu failed: %v", err) + } + + gpu := getResp.GetGpu() + if len(gpu.GetStatus().GetConditions()) != 1 { + t.Errorf("expected 1 condition in retrieved GPU, got %d", len(gpu.GetStatus().GetConditions())) + } + + cond := gpu.GetStatus().GetConditions()[0] + if cond.GetType() != "Ready" { + t.Errorf("expected condition type %q, got %q", "Ready", cond.GetType()) + } + if cond.GetStatus() != "True" { + t.Errorf("expected condition status %q, got %q", "True", cond.GetStatus()) + } + if cond.GetReason() != "TestReason" { + t.Errorf("expected condition reason %q, got %q", "TestReason", cond.GetReason()) + } + if gpu.GetStatus().GetRecommendedAction() != "No action needed" { + t.Errorf("expected recommended action %q, got %q", "No action needed", gpu.GetStatus().GetRecommendedAction()) + } + + // Clean up + _, err = client.DeleteGpu(ctx, &pb.DeleteGpuRequest{ + Name: gpuName, + Namespace: "default", + }) + if err != nil { + t.Errorf("cleanup DeleteGpu failed: %v", err) + } +} + +// TestIntegration_ErrorCodes verifies correct gRPC error codes are returned. 
+func TestIntegration_ErrorCodes(t *testing.T) { + client := testutil.NewTestGPUClient(t) + ctx := t.Context() + + const gpuName = "GPU-deadbeef-dead-beef-dead-beefdeadbeef" + + // Get non-existent GPU → codes.NotFound + _, err := client.GetGpu(ctx, &pb.GetGpuRequest{ + Name: gpuName, + Namespace: "default", + }) + if err == nil { + t.Fatal("expected error for non-existent GPU") + } + if status.Code(err) != codes.NotFound { + t.Errorf("expected codes.NotFound, got %v", status.Code(err)) + } + + // Create a GPU + _, err = client.CreateGpu(ctx, &pb.CreateGpuRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: gpuName, + Namespace: "default", + }, + Spec: &pb.GpuSpec{ + Uuid: "GPU-ERROR-1", + }, + }, + }) + if err != nil { + t.Fatalf("CreateGpu failed: %v", err) + } + + // Create duplicate → codes.AlreadyExists + _, err = client.CreateGpu(ctx, &pb.CreateGpuRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: gpuName, + Namespace: "default", + }, + Spec: &pb.GpuSpec{ + Uuid: "GPU-ERROR-2", + }, + }, + }) + if err == nil { + t.Fatal("expected error for duplicate GPU creation") + } + if status.Code(err) != codes.AlreadyExists { + t.Errorf("expected codes.AlreadyExists, got %v", status.Code(err)) + } + + // Delete the GPU + _, err = client.DeleteGpu(ctx, &pb.DeleteGpuRequest{ + Name: gpuName, + Namespace: "default", + }) + if err != nil { + t.Fatalf("DeleteGpu failed: %v", err) + } + + // Delete non-existent → codes.NotFound + _, err = client.DeleteGpu(ctx, &pb.DeleteGpuRequest{ + Name: gpuName, + Namespace: "default", + }) + if err == nil { + t.Fatal("expected error for deleting non-existent GPU") + } + if status.Code(err) != codes.NotFound { + t.Errorf("expected codes.NotFound for delete, got %v", status.Code(err)) + } +} diff --git a/pkg/storage/memory/factory.go b/pkg/storage/memory/factory.go new file mode 100644 index 000000000..057dd2edb --- /dev/null +++ b/pkg/storage/memory/factory.go @@ -0,0 +1,32 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. 
All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memory + +import ( + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apiserver/pkg/storage" + "k8s.io/apiserver/pkg/storage/storagebackend/factory" +) + +// CreateStorage returns a new in-memory storage.Interface, a DestroyFunc, and any error. +// This mirrors the signature of storagebackend/factory.Create() so it can be +// used as a drop-in replacement in ServiceProvider.Install(). +func CreateStorage(codec runtime.Codec) (storage.Interface, factory.DestroyFunc, error) { + store := NewStore(codec) + destroy := func() { + // No resources to release for in-memory storage. + } + return store, destroy, nil +} diff --git a/pkg/storage/memory/factory_test.go b/pkg/storage/memory/factory_test.go new file mode 100644 index 000000000..49a749e62 --- /dev/null +++ b/pkg/storage/memory/factory_test.go @@ -0,0 +1,62 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package memory + +import ( + "context" + "testing" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apiserver/pkg/storage" +) + +func TestCreateStorage(t *testing.T) { + s, destroy, err := CreateStorage(codec) + if err != nil { + t.Fatalf("CreateStorage failed: %v", err) + } + defer destroy() + + if s == nil { + t.Fatal("expected non-nil storage.Interface") + } + + // Verify it's functional by doing a basic Create + Get. + ctx := context.Background() + obj := newTestObject("factory-gpu", "default") + if err := s.Create(ctx, "/test/factory-gpu", obj, nil, 0); err != nil { + t.Fatalf("Create via factory storage failed: %v", err) + } + + got := &unstructured.Unstructured{} + if err := s.Get(ctx, "/test/factory-gpu", storage.GetOptions{}, got); err != nil { + t.Fatalf("Get via factory storage failed: %v", err) + } + + if got.GetName() != "factory-gpu" { + t.Errorf("expected name factory-gpu, got %s", got.GetName()) + } +} + +func TestCreateStorage_DestroyIsIdempotent(t *testing.T) { + _, destroy, err := CreateStorage(codec) + if err != nil { + t.Fatal(err) + } + + // Should not panic when called multiple times. + destroy() + destroy() +} diff --git a/pkg/storage/memory/store.go b/pkg/storage/memory/store.go new file mode 100644 index 000000000..27b085383 --- /dev/null +++ b/pkg/storage/memory/store.go @@ -0,0 +1,492 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package memory + +import ( + "bytes" + "context" + "fmt" + "strings" + "sync" + + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/validation/field" + "k8s.io/apimachinery/pkg/watch" + "k8s.io/apiserver/pkg/storage" +) + +// item holds an encoded object and its associated resource version. +type item struct { + key string + data []byte + rv uint64 +} + +// Store is a thread-safe, in-memory implementation of storage.Interface. +// Objects are stored as codec-encoded bytes keyed by hierarchical path strings. +type Store struct { + codec runtime.Codec + mu sync.RWMutex + items map[string]*item + rev uint64 + watchers *watchManager +} + +// Compile-time interface compliance check. +var _ storage.Interface = (*Store)(nil) + +// NewStore creates a new in-memory store that encodes and decodes objects +// using the provided codec. The watch channel buffer uses the default size +// (watchChannelSize). Use NewStoreWithOptions for custom buffer sizes. +func NewStore(codec runtime.Codec) *Store { + return &Store{ + codec: codec, + items: make(map[string]*item), + watchers: newWatchManager(watchChannelSize), + } +} + +// Versioner returns the storage versioner used to manage resource versions on +// API objects. This implementation uses the standard APIObjectVersioner. +func (s *Store) Versioner() storage.Versioner { + return storage.APIObjectVersioner{} +} + +// Create adds a new object at the given key. If an object already exists at +// that key, a KeyExists error is returned. The out parameter, if non-nil, is +// populated with the stored object including its assigned resource version. 
+func (s *Store) Create(ctx context.Context, key string, obj, out runtime.Object, ttl uint64) error { + s.mu.Lock() + defer s.mu.Unlock() + + if _, exists := s.items[key]; exists { + return storage.NewKeyExistsError(key, 0) + } + + s.rev++ + rv := s.rev + + if err := s.Versioner().PrepareObjectForStorage(obj); err != nil { + return fmt.Errorf("PrepareObjectForStorage failed: %w", err) + } + + if err := s.Versioner().UpdateObject(obj, rv); err != nil { + return fmt.Errorf("UpdateObject failed: %w", err) + } + + data, err := s.encode(obj) + if err != nil { + return err + } + + s.items[key] = &item{ + key: key, + data: data, + rv: rv, + } + + if out != nil { + if err := s.decode(data, out); err != nil { + return err + } + } + + // DeepCopy is required: watchers must receive an isolated snapshot. + // The copy runs under s.mu write lock, so watch-heavy workloads + // should keep stored objects small. + s.watchers.sendLocked(watch.Event{ + Type: watch.Added, + Object: obj.DeepCopyObject(), + }, key) + + return nil +} + +// Delete removes the object at the given key. If the key does not exist, +// a KeyNotFound error is returned. Preconditions and validation callbacks +// are checked before deletion proceeds. +func (s *Store) Delete( + ctx context.Context, + key string, + out runtime.Object, + preconditions *storage.Preconditions, + validateDeletion storage.ValidateObjectFunc, + cachedExistingObject runtime.Object, + opts storage.DeleteOptions, +) error { + s.mu.Lock() + defer s.mu.Unlock() + + existing, ok := s.items[key] + if !ok { + return storage.NewKeyNotFoundError(key, 0) + } + + existingObj, err := s.decodeNew(existing.data) + if err != nil { + return err + } + + if err := s.checkPreconditions(key, preconditions, existingObj); err != nil { + return err + } + + // validateDeletion must be fast and non-blocking. It runs while the store + // write lock is held; a slow callback freezes all storage operations. 
+ if validateDeletion != nil { + if err := validateDeletion(ctx, existingObj); err != nil { + return err + } + } + + delete(s.items, key) + + s.rev++ + + if out != nil { + if err := s.decode(existing.data, out); err != nil { + return err + } + } + + // Deep copy for watcher isolation. + s.watchers.sendLocked(watch.Event{ + Type: watch.Deleted, + Object: existingObj.DeepCopyObject(), + }, key) + + return nil +} + +// Watch begins watching the specified key prefix. Events matching the key +// prefix are sent on the returned watch.Interface. The watch is automatically +// stopped when the context is cancelled. +// +// The in-memory store does not support resuming watches from a specific +// ResourceVersion. Passing a non-empty ResourceVersion returns an error. +func (s *Store) Watch(ctx context.Context, key string, opts storage.ListOptions) (watch.Interface, error) { + if opts.ResourceVersion != "" { + return nil, storage.NewInvalidError(field.ErrorList{ + field.Invalid( + field.NewPath("resourceVersion"), + opts.ResourceVersion, + "in-memory store does not support watch resume from resource version", + ), + }) + } + + w := s.watchers.watch(key) + done := w.done // capture before spawning goroutine + + go func() { + select { + case <-ctx.Done(): + w.Stop() + case <-done: + // Watcher was stopped directly; goroutine can exit. + } + }() + + return w, nil +} + +// Get retrieves the object stored at the given key and decodes it into objPtr. +// If the key does not exist and opts.IgnoreNotFound is false, a KeyNotFound +// error is returned. If IgnoreNotFound is true, objPtr is left at its zero value. 
+func (s *Store) Get(ctx context.Context, key string, opts storage.GetOptions, objPtr runtime.Object) error { + s.mu.RLock() + defer s.mu.RUnlock() + + existing, ok := s.items[key] + if !ok { + if opts.IgnoreNotFound { + return nil + } + + return storage.NewKeyNotFoundError(key, 0) + } + + return s.decode(existing.data, objPtr) +} + +// GetList retrieves all objects whose keys match the given prefix (when +// opts.Recursive is true) or the exact key (otherwise), and populates +// listObj with the matching items. The list's resource version is set to +// the store's current revision. +func (s *Store) GetList(ctx context.Context, key string, opts storage.ListOptions, listObj runtime.Object) error { + s.mu.RLock() + defer s.mu.RUnlock() + + prefix := key + if opts.Recursive && !strings.HasSuffix(prefix, "/") { + prefix += "/" + } + + var objs []runtime.Object + + for k, it := range s.items { + var match bool + if opts.Recursive { + match = strings.HasPrefix(k, prefix) + } else { + match = k == key + } + + if !match { + continue + } + + obj, err := s.decodeNew(it.data) + if err != nil { + return err + } + + if !predicateEmpty(opts.Predicate) { + matches, err := opts.Predicate.Matches(obj) + if err != nil { + return err + } + + if !matches { + continue + } + } + + objs = append(objs, obj) + } + + if err := meta.SetList(listObj, objs); err != nil { + return err + } + + return s.setListRV(listObj, s.rev) +} + +// GuaranteedUpdate reads the current object at the given key, passes it to +// tryUpdate, and writes the result back. If the key does not exist and +// ignoreNotFound is false, a KeyNotFound error is returned. The operation +// is retried internally if the tryUpdate function returns a retriable error. 
+func (s *Store) GuaranteedUpdate( + ctx context.Context, + key string, + destination runtime.Object, + ignoreNotFound bool, + preconditions *storage.Preconditions, + tryUpdate storage.UpdateFunc, + cachedExistingObject runtime.Object, +) error { + s.mu.Lock() + defer s.mu.Unlock() + + existing, ok := s.items[key] + + var currentObj runtime.Object + var currentRV uint64 + + if ok { + obj, err := s.decodeNew(existing.data) + if err != nil { + return err + } + + currentObj = obj + currentRV = existing.rv + } else { + if !ignoreNotFound { + return storage.NewKeyNotFoundError(key, 0) + } + + currentObj = destination.DeepCopyObject() + } + + if err := s.checkPreconditions(key, preconditions, currentObj); err != nil { + return err + } + + updated, _, err := tryUpdate(currentObj, storage.ResponseMeta{ResourceVersion: currentRV}) + if err != nil { + return err + } + + s.rev++ + rv := s.rev + + if err := s.Versioner().UpdateObject(updated, rv); err != nil { + return fmt.Errorf("UpdateObject failed: %w", err) + } + + data, err := s.encode(updated) + if err != nil { + return err + } + + s.items[key] = &item{ + key: key, + data: data, + rv: rv, + } + + if err := s.decode(data, destination); err != nil { + return err + } + + evType := watch.Modified + if !ok { + evType = watch.Added + } + + // Deep copy for watcher isolation. + s.watchers.sendLocked(watch.Event{ + Type: evType, + Object: updated.DeepCopyObject(), + }, key) + + return nil +} + +// Stats returns basic storage statistics. Currently reports only the number +// of stored objects. +func (s *Store) Stats(ctx context.Context) (storage.Stats, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + return storage.Stats{ + ObjectCount: int64(len(s.items)), + }, nil +} + +// ReadinessCheck reports whether the store is ready. The in-memory store is +// always ready, so this always returns nil. +func (s *Store) ReadinessCheck() error { + return nil +} + +// RequestWatchProgress is a no-op for the in-memory store. 
It exists to +// satisfy the storage.Interface and is only meaningful for etcd-backed stores. +func (s *Store) RequestWatchProgress(ctx context.Context) error { + return nil +} + +// GetCurrentResourceVersion returns the store's current monotonic revision. +func (s *Store) GetCurrentResourceVersion(ctx context.Context) (uint64, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + return s.rev, nil +} + +// EnableResourceSizeEstimation is a no-op for the in-memory store. Size +// estimation is only relevant for disk-backed storage backends. +func (s *Store) EnableResourceSizeEstimation(storage.KeysFunc) error { + return nil +} + +// CompactRevision returns the latest observed compacted revision. The +// in-memory store does not perform compaction, so this always returns 0. +func (s *Store) CompactRevision() int64 { + return 0 +} + +// --- internal helpers --- + +// encode serializes an object into bytes using the store's codec. +func (s *Store) encode(obj runtime.Object) ([]byte, error) { + var buf bytes.Buffer + if err := s.codec.Encode(obj, &buf); err != nil { + return nil, fmt.Errorf("encode failed: %w", err) + } + + return buf.Bytes(), nil +} + +// decode deserializes bytes into an existing object using the store's codec. +func (s *Store) decode(data []byte, into runtime.Object) error { + _, _, err := s.codec.Decode(data, nil, into) + if err != nil { + return fmt.Errorf("decode failed: %w", err) + } + + return nil +} + +// decodeNew deserializes bytes into a new object allocated by the codec. +func (s *Store) decodeNew(data []byte) (runtime.Object, error) { + obj, _, err := s.codec.Decode(data, nil, nil) + if err != nil { + return nil, fmt.Errorf("decode failed: %w", err) + } + + return obj, nil +} + +// setListRV sets the resource version on a list object using the versioner. 
+func (s *Store) setListRV(listObj runtime.Object, rv uint64) error { + return s.Versioner().UpdateList(listObj, rv, "", nil) +} + +// predicateEmpty returns true if the predicate performs no filtering. +// It guards against nil Label/Field selectors that would panic in +// SelectionPredicate.Empty(). +func predicateEmpty(p storage.SelectionPredicate) bool { + if p.Label == nil && p.Field == nil { + return true + } + + return p.Empty() +} + +// checkPreconditions verifies that the given preconditions are met by the +// existing object. Returns an error if UID or ResourceVersion do not match. +func (s *Store) checkPreconditions(key string, preconditions *storage.Preconditions, obj runtime.Object) error { + if preconditions == nil { + return nil + } + + if preconditions.UID != nil { + accessor, err := meta.Accessor(obj) + if err != nil { + return err + } + + if accessor.GetUID() != *preconditions.UID { + return storage.NewInvalidObjError(key, fmt.Sprintf( + "precondition UID mismatch: expected %s, got %s", + *preconditions.UID, accessor.GetUID(), + )) + } + } + + if preconditions.ResourceVersion != nil { + rv, err := s.Versioner().ObjectResourceVersion(obj) + if err != nil { + return err + } + + expectedRV, err := s.Versioner().ParseResourceVersion(*preconditions.ResourceVersion) + if err != nil { + return err + } + + if rv != expectedRV { + return storage.NewInvalidObjError(key, fmt.Sprintf( + "precondition ResourceVersion mismatch: expected %d, got %d", + expectedRV, rv, + )) + } + } + + return nil +} diff --git a/pkg/storage/memory/store_test.go b/pkg/storage/memory/store_test.go new file mode 100644 index 000000000..ffd6edc0f --- /dev/null +++ b/pkg/storage/memory/store_test.go @@ -0,0 +1,794 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memory + +import ( + "context" + "fmt" + "testing" + "time" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/watch" + "k8s.io/apiserver/pkg/storage" +) + +// codec is the shared codec used by all tests. UnstructuredJSONScheme handles +// encoding and decoding of unstructured.Unstructured objects without needing +// a registered scheme or concrete Go types. +var codec runtime.Codec = unstructured.UnstructuredJSONScheme + +// newTestObject builds an *unstructured.Unstructured with the given name and +// namespace, suitable for storage in the test store. +func newTestObject(name, namespace string) *unstructured.Unstructured { + return &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "v1", + "kind": "GPU", + "metadata": map[string]any{ + "name": name, + "namespace": namespace, + }, + }, + } +} + +func TestStore_CreateAndGet(t *testing.T) { + s := NewStore(codec) + ctx := context.Background() + + obj := newTestObject("gpu-0", "default") + out := &unstructured.Unstructured{} + + if err := s.Create(ctx, "/gpus/default/gpu-0", obj, out, 0); err != nil { + t.Fatalf("Create failed: %v", err) + } + + // Verify resourceVersion was set on the output object. + rv := out.GetResourceVersion() + if rv == "" { + t.Fatal("expected resourceVersion to be set on out, got empty string") + } + + if rv != "1" { + t.Fatalf("expected resourceVersion '1', got %q", rv) + } + + // Get the object back. 
+ got := &unstructured.Unstructured{} + if err := s.Get(ctx, "/gpus/default/gpu-0", storage.GetOptions{}, got); err != nil { + t.Fatalf("Get failed: %v", err) + } + + if got.GetName() != "gpu-0" { + t.Fatalf("expected name 'gpu-0', got %q", got.GetName()) + } + + if got.GetResourceVersion() != "1" { + t.Fatalf("expected resourceVersion '1', got %q", got.GetResourceVersion()) + } +} + +func TestStore_CreateDuplicate(t *testing.T) { + s := NewStore(codec) + ctx := context.Background() + + obj := newTestObject("gpu-0", "default") + + if err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0); err != nil { + t.Fatalf("first Create failed: %v", err) + } + + err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0) + if err == nil { + t.Fatal("expected error on duplicate Create, got nil") + } + + if !storage.IsExist(err) { + t.Fatalf("expected IsExist error, got: %v", err) + } +} + +func TestStore_GetNotFound(t *testing.T) { + s := NewStore(codec) + ctx := context.Background() + + got := &unstructured.Unstructured{} + err := s.Get(ctx, "/gpus/default/gpu-missing", storage.GetOptions{}, got) + + if err == nil { + t.Fatal("expected error on Get for missing key, got nil") + } + + if !storage.IsNotFound(err) { + t.Fatalf("expected IsNotFound error, got: %v", err) + } +} + +func TestStore_GetList(t *testing.T) { + s := NewStore(codec) + ctx := context.Background() + + // Create 3 objects under the same prefix. 
+ for _, name := range []string{"gpu-0", "gpu-1", "gpu-2"} { + obj := newTestObject(name, "default") + if err := s.Create(ctx, "/gpus/default/"+name, obj, nil, 0); err != nil { + t.Fatalf("Create %s failed: %v", name, err) + } + } + + list := &unstructured.UnstructuredList{} + opts := storage.ListOptions{ + Recursive: true, + Predicate: storage.SelectionPredicate{}, + } + + if err := s.GetList(ctx, "/gpus/default", opts, list); err != nil { + t.Fatalf("GetList failed: %v", err) + } + + if len(list.Items) != 3 { + t.Fatalf("expected 3 items, got %d", len(list.Items)) + } + + // Verify the list has a resource version. + if list.GetResourceVersion() == "" { + t.Fatal("expected list resourceVersion to be set") + } +} + +func TestStore_GuaranteedUpdate(t *testing.T) { + s := NewStore(codec) + ctx := context.Background() + + obj := newTestObject("gpu-0", "default") + if err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0); err != nil { + t.Fatalf("Create failed: %v", err) + } + + dest := &unstructured.Unstructured{} + err := s.GuaranteedUpdate(ctx, "/gpus/default/gpu-0", dest, false, nil, + func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) { + u := input.(*unstructured.Unstructured) + labels := u.GetLabels() + if labels == nil { + labels = make(map[string]string) + } + + labels["test-key"] = "test-value" + u.SetLabels(labels) + + return u, nil, nil + }, nil) + if err != nil { + t.Fatalf("GuaranteedUpdate failed: %v", err) + } + + // Verify the label was persisted. + got := &unstructured.Unstructured{} + if err := s.Get(ctx, "/gpus/default/gpu-0", storage.GetOptions{}, got); err != nil { + t.Fatalf("Get after update failed: %v", err) + } + + labels := got.GetLabels() + if labels["test-key"] != "test-value" { + t.Fatalf("expected label 'test-key'='test-value', got labels: %v", labels) + } + + // Verify resourceVersion was incremented. 
+ if got.GetResourceVersion() != "2" { + t.Fatalf("expected resourceVersion '2' after update, got %q", got.GetResourceVersion()) + } +} + +func TestStore_GuaranteedUpdate_NotFound(t *testing.T) { + s := NewStore(codec) + ctx := context.Background() + + dest := &unstructured.Unstructured{} + err := s.GuaranteedUpdate(ctx, "/gpus/default/gpu-missing", dest, false, nil, + func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) { + return input, nil, nil + }, nil) + + if err == nil { + t.Fatal("expected error on GuaranteedUpdate for missing key with ignoreNotFound=false") + } + + if !storage.IsNotFound(err) { + t.Fatalf("expected IsNotFound error, got: %v", err) + } +} + +func TestStore_Delete(t *testing.T) { + s := NewStore(codec) + ctx := context.Background() + + obj := newTestObject("gpu-0", "default") + if err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0); err != nil { + t.Fatalf("Create failed: %v", err) + } + + out := &unstructured.Unstructured{} + err := s.Delete(ctx, "/gpus/default/gpu-0", out, nil, nil, nil, storage.DeleteOptions{}) + if err != nil { + t.Fatalf("Delete failed: %v", err) + } + + if out.GetName() != "gpu-0" { + t.Fatalf("expected deleted object name 'gpu-0', got %q", out.GetName()) + } + + // Verify the object is gone. 
+ got := &unstructured.Unstructured{} + err = s.Get(ctx, "/gpus/default/gpu-0", storage.GetOptions{}, got) + + if err == nil { + t.Fatal("expected NotFound error after delete, got nil") + } + + if !storage.IsNotFound(err) { + t.Fatalf("expected IsNotFound error, got: %v", err) + } +} + +func TestStore_DeleteNotFound(t *testing.T) { + s := NewStore(codec) + ctx := context.Background() + + out := &unstructured.Unstructured{} + err := s.Delete(ctx, "/gpus/default/gpu-missing", out, nil, nil, nil, storage.DeleteOptions{}) + + if err == nil { + t.Fatal("expected error on Delete for missing key, got nil") + } + + if !storage.IsNotFound(err) { + t.Fatalf("expected IsNotFound error, got: %v", err) + } +} + +func TestStore_Watch(t *testing.T) { + s := NewStore(codec) + ctx := t.Context() + + // Watch subscription is synchronous — the watcher is registered before + // Watch() returns. The subsequent Create() will acquire the store lock + // and broadcast to all registered watchers, including ours. + w, err := s.Watch(ctx, "/gpus/default/", storage.ListOptions{}) + if err != nil { + t.Fatalf("Watch failed: %v", err) + } + + defer w.Stop() + + // Create object — guaranteed to notify our watcher. + obj := newTestObject("gpu-0", "default") + if err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0); err != nil { + t.Fatalf("Create failed: %v", err) + } + + select { + case ev := <-w.ResultChan(): + if ev.Type != watch.Added { + t.Fatalf("expected ADDED event, got %v", ev.Type) + } + + u, ok := ev.Object.(*unstructured.Unstructured) + if !ok { + t.Fatalf("expected *unstructured.Unstructured, got %T", ev.Object) + } + + if u.GetName() != "gpu-0" { + t.Fatalf("expected event object name 'gpu-0', got %q", u.GetName()) + } + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for watch event") + } +} + +func TestStore_Watch_Delete(t *testing.T) { + s := NewStore(codec) + ctx := t.Context() + + // Create the object first, before starting the watch. 
+ obj := newTestObject("gpu-0", "default") + if err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0); err != nil { + t.Fatalf("Create failed: %v", err) + } + + w, err := s.Watch(ctx, "/gpus/default/", storage.ListOptions{}) + if err != nil { + t.Fatalf("Watch failed: %v", err) + } + + defer w.Stop() + + // Delete the object; the watcher should receive a DELETED event. + out := &unstructured.Unstructured{} + if err := s.Delete(ctx, "/gpus/default/gpu-0", out, nil, nil, nil, storage.DeleteOptions{}); err != nil { + t.Fatalf("Delete failed: %v", err) + } + + select { + case ev := <-w.ResultChan(): + if ev.Type != watch.Deleted { + t.Fatalf("expected DELETED event, got %v", ev.Type) + } + + u, ok := ev.Object.(*unstructured.Unstructured) + if !ok { + t.Fatalf("expected *unstructured.Unstructured, got %T", ev.Object) + } + + if u.GetName() != "gpu-0" { + t.Fatalf("expected event object name 'gpu-0', got %q", u.GetName()) + } + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for DELETED watch event") + } +} + +func TestStore_Stats(t *testing.T) { + s := NewStore(codec) + ctx := context.Background() + + for _, name := range []string{"gpu-0", "gpu-1"} { + obj := newTestObject(name, "default") + if err := s.Create(ctx, "/gpus/default/"+name, obj, nil, 0); err != nil { + t.Fatalf("Create %s failed: %v", name, err) + } + } + + stats, err := s.Stats(ctx) + if err != nil { + t.Fatalf("Stats failed: %v", err) + } + + if stats.ObjectCount != 2 { + t.Fatalf("expected ObjectCount 2, got %d", stats.ObjectCount) + } +} + +func TestStore_ReadinessCheck(t *testing.T) { + s := NewStore(codec) + + if err := s.ReadinessCheck(); err != nil { + t.Fatalf("ReadinessCheck failed: %v", err) + } +} + +func TestStore_GetCurrentResourceVersion(t *testing.T) { + s := NewStore(codec) + ctx := context.Background() + + rv0, err := s.GetCurrentResourceVersion(ctx) + if err != nil { + t.Fatalf("GetCurrentResourceVersion failed: %v", err) + } + + if rv0 != 0 { + t.Fatalf("expected 
initial resourceVersion 0, got %d", rv0) + } + + // Create two objects; each should increment the revision. + for _, name := range []string{"gpu-0", "gpu-1"} { + obj := newTestObject(name, "default") + if err := s.Create(ctx, "/gpus/default/"+name, obj, nil, 0); err != nil { + t.Fatalf("Create %s failed: %v", name, err) + } + } + + rv2, err := s.GetCurrentResourceVersion(ctx) + if err != nil { + t.Fatalf("GetCurrentResourceVersion failed: %v", err) + } + + if rv2 != 2 { + t.Fatalf("expected resourceVersion 2 after two creates, got %d", rv2) + } +} + +func TestStore_DeleteWithPreconditions(t *testing.T) { + s := NewStore(codec) + ctx := context.Background() + + obj := newTestObject("gpu-0", "default") + obj.SetUID("test-uid-123") + + out := &unstructured.Unstructured{} + if err := s.Create(ctx, "/gpus/default/gpu-0", obj, out, 0); err != nil { + t.Fatalf("Create failed: %v", err) + } + + // Delete with wrong UID precondition should fail. + wrongUID := types.UID("wrong-uid") + precond := &storage.Preconditions{UID: &wrongUID} + delOut := &unstructured.Unstructured{} + err := s.Delete(ctx, "/gpus/default/gpu-0", delOut, precond, nil, nil, storage.DeleteOptions{}) + if err == nil { + t.Fatal("expected error on Delete with wrong UID precondition, got nil") + } + + // Verify the object still exists. + got := &unstructured.Unstructured{} + if err := s.Get(ctx, "/gpus/default/gpu-0", storage.GetOptions{}, got); err != nil { + t.Fatalf("Get after failed delete should succeed: %v", err) + } + + // Delete with correct UID precondition should succeed. 
+ correctUID := types.UID("test-uid-123") + precond = &storage.Preconditions{UID: &correctUID} + delOut = &unstructured.Unstructured{} + if err := s.Delete(ctx, "/gpus/default/gpu-0", delOut, precond, nil, nil, storage.DeleteOptions{}); err != nil { + t.Fatalf("Delete with correct UID precondition failed: %v", err) + } + + if delOut.GetName() != "gpu-0" { + t.Fatalf("expected deleted object name 'gpu-0', got %q", delOut.GetName()) + } + + // Verify the object is gone. + err = s.Get(ctx, "/gpus/default/gpu-0", storage.GetOptions{}, &unstructured.Unstructured{}) + if err == nil { + t.Fatal("expected NotFound error after delete, got nil") + } + + if !storage.IsNotFound(err) { + t.Fatalf("expected IsNotFound error, got: %v", err) + } +} + +func TestStore_GuaranteedUpdate_Preconditions(t *testing.T) { + s := NewStore(codec) + ctx := context.Background() + + obj := newTestObject("gpu-0", "default") + obj.SetUID("known-uid-456") + + if err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0); err != nil { + t.Fatalf("Create failed: %v", err) + } + + // GuaranteedUpdate with wrong UID precondition should fail. + wrongUID := types.UID("wrong-uid") + precond := &storage.Preconditions{UID: &wrongUID} + dest := &unstructured.Unstructured{} + err := s.GuaranteedUpdate(ctx, "/gpus/default/gpu-0", dest, false, precond, + func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) { + return input, nil, nil + }, nil) + if err == nil { + t.Fatal("expected error on GuaranteedUpdate with wrong UID precondition, got nil") + } + + // Verify the object was not modified (still at resourceVersion 1). + got := &unstructured.Unstructured{} + if err := s.Get(ctx, "/gpus/default/gpu-0", storage.GetOptions{}, got); err != nil { + t.Fatalf("Get failed: %v", err) + } + + if got.GetResourceVersion() != "1" { + t.Fatalf("expected resourceVersion '1' (unmodified), got %q", got.GetResourceVersion()) + } + + // GuaranteedUpdate with correct UID precondition should succeed. 
+ correctUID := types.UID("known-uid-456") + precond = &storage.Preconditions{UID: &correctUID} + dest = &unstructured.Unstructured{} + err = s.GuaranteedUpdate(ctx, "/gpus/default/gpu-0", dest, false, precond, + func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) { + u := input.(*unstructured.Unstructured) + labels := u.GetLabels() + if labels == nil { + labels = make(map[string]string) + } + + labels["updated"] = "true" + u.SetLabels(labels) + + return u, nil, nil + }, nil) + if err != nil { + t.Fatalf("GuaranteedUpdate with correct UID precondition failed: %v", err) + } + + // Verify the update was applied. + got = &unstructured.Unstructured{} + if err := s.Get(ctx, "/gpus/default/gpu-0", storage.GetOptions{}, got); err != nil { + t.Fatalf("Get after update failed: %v", err) + } + + if got.GetLabels()["updated"] != "true" { + t.Fatalf("expected label 'updated'='true', got labels: %v", got.GetLabels()) + } + + if got.GetResourceVersion() != "2" { + t.Fatalf("expected resourceVersion '2' after update, got %q", got.GetResourceVersion()) + } +} + +func TestStore_GuaranteedUpdate_IgnoreNotFound(t *testing.T) { + s := NewStore(codec) + ctx := context.Background() + + dest := &unstructured.Unstructured{} + var receivedEmpty bool + err := s.GuaranteedUpdate(ctx, "/gpus/default/gpu-new", dest, true, nil, + func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) { + u := input.(*unstructured.Unstructured) + // When ignoreNotFound is true and the key doesn't exist, the input + // should be a zero-value object (deep copy of destination). + if u.GetName() == "" && u.GetNamespace() == "" { + receivedEmpty = true + } + + // Populate the object so it gets created. 
+ u.SetUnstructuredContent(map[string]any{ + "apiVersion": "v1", + "kind": "GPU", + "metadata": map[string]any{ + "name": "gpu-new", + "namespace": "default", + }, + }) + + return u, nil, nil + }, nil) + if err != nil { + t.Fatalf("GuaranteedUpdate with ignoreNotFound=true failed: %v", err) + } + + if !receivedEmpty { + t.Fatal("expected tryUpdate to receive a zero-value object, but it did not") + } + + // Verify the object was created and can be retrieved. + got := &unstructured.Unstructured{} + if err := s.Get(ctx, "/gpus/default/gpu-new", storage.GetOptions{}, got); err != nil { + t.Fatalf("Get after GuaranteedUpdate (ignoreNotFound) failed: %v", err) + } + + if got.GetName() != "gpu-new" { + t.Fatalf("expected name 'gpu-new', got %q", got.GetName()) + } + + if got.GetResourceVersion() == "" { + t.Fatal("expected resourceVersion to be set, got empty string") + } +} + +func TestStore_Watch_Modified(t *testing.T) { + s := NewStore(codec) + ctx := t.Context() + + w, err := s.Watch(ctx, "/gpus/default/", storage.ListOptions{}) + if err != nil { + t.Fatalf("Watch failed: %v", err) + } + + defer w.Stop() + + // Create an object. + obj := newTestObject("gpu-0", "default") + if err := s.Create(ctx, "/gpus/default/gpu-0", obj, nil, 0); err != nil { + t.Fatalf("Create failed: %v", err) + } + + // Consume the ADDED event. + select { + case ev := <-w.ResultChan(): + if ev.Type != watch.Added { + t.Fatalf("expected ADDED event, got %v", ev.Type) + } + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for ADDED watch event") + } + + // Update the object via GuaranteedUpdate. 
+ dest := &unstructured.Unstructured{} + err = s.GuaranteedUpdate(ctx, "/gpus/default/gpu-0", dest, false, nil, + func(input runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) { + u := input.(*unstructured.Unstructured) + labels := u.GetLabels() + if labels == nil { + labels = make(map[string]string) + } + + labels["modified"] = "true" + u.SetLabels(labels) + + return u, nil, nil + }, nil) + if err != nil { + t.Fatalf("GuaranteedUpdate failed: %v", err) + } + + // Verify a MODIFIED event is received. + select { + case ev := <-w.ResultChan(): + if ev.Type != watch.Modified { + t.Fatalf("expected MODIFIED event, got %v", ev.Type) + } + + u, ok := ev.Object.(*unstructured.Unstructured) + if !ok { + t.Fatalf("expected *unstructured.Unstructured, got %T", ev.Object) + } + + if u.GetLabels()["modified"] != "true" { + t.Fatalf("expected label 'modified'='true' on event object, got labels: %v", u.GetLabels()) + } + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for MODIFIED watch event") + } +} + +func TestStore_Watch_KeyPrefixFiltering(t *testing.T) { + s := NewStore(codec) + ctx := t.Context() + + // Watch only the /gpus/default/ prefix. + w, err := s.Watch(ctx, "/gpus/default/", storage.ListOptions{}) + if err != nil { + t.Fatalf("Watch failed: %v", err) + } + + defer w.Stop() + + // Create an object under a different namespace; should NOT produce an event. + otherObj := newTestObject("gpu-0", "other-ns") + if err := s.Create(ctx, "/gpus/other-ns/gpu-0", otherObj, nil, 0); err != nil { + t.Fatalf("Create other-ns object failed: %v", err) + } + + // Verify no event is received within a short timeout. + select { + case ev := <-w.ResultChan(): + t.Fatalf("expected no event for other-ns object, but got %v event", ev.Type) + case <-time.After(500 * time.Millisecond): + // Good: no event received. + } + + // Create an object under the watched prefix; SHOULD produce an ADDED event. 
+ defaultObj := newTestObject("gpu-0", "default") + if err := s.Create(ctx, "/gpus/default/gpu-0", defaultObj, nil, 0); err != nil { + t.Fatalf("Create default object failed: %v", err) + } + + select { + case ev := <-w.ResultChan(): + if ev.Type != watch.Added { + t.Fatalf("expected ADDED event, got %v", ev.Type) + } + + u, ok := ev.Object.(*unstructured.Unstructured) + if !ok { + t.Fatalf("expected *unstructured.Unstructured, got %T", ev.Object) + } + + if u.GetName() != "gpu-0" { + t.Fatalf("expected event object name 'gpu-0', got %q", u.GetName()) + } + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for ADDED watch event for default namespace object") + } +} + +func TestStore_GetIgnoreNotFound(t *testing.T) { + s := NewStore(codec) + ctx := context.Background() + + got := &unstructured.Unstructured{} + err := s.Get(ctx, "/gpus/default/gpu-missing", storage.GetOptions{IgnoreNotFound: true}, got) + if err != nil { + t.Fatalf("expected no error with IgnoreNotFound=true, got: %v", err) + } + + // The object should be at its zero value (no name set). + if got.GetName() != "" { + t.Fatalf("expected empty name on zero-value object, got %q", got.GetName()) + } +} + +func TestStore_GetList_NonRecursive(t *testing.T) { + s := NewStore(codec) + ctx := context.Background() + + // Create two objects under the same prefix. + for _, name := range []string{"gpu-0", "gpu-1"} { + obj := newTestObject(name, "default") + if err := s.Create(ctx, "/gpus/default/"+name, obj, nil, 0); err != nil { + t.Fatalf("Create %s failed: %v", name, err) + } + } + + // GetList with Recursive=false on an exact key should return only that one item. 
+ list := &unstructured.UnstructuredList{} + opts := storage.ListOptions{ + Recursive: false, + Predicate: storage.SelectionPredicate{}, + } + + if err := s.GetList(ctx, "/gpus/default/gpu-0", opts, list); err != nil { + t.Fatalf("GetList failed: %v", err) + } + + if len(list.Items) != 1 { + t.Fatalf("expected 1 item with non-recursive GetList, got %d", len(list.Items)) + } + + if list.Items[0].GetName() != "gpu-0" { + t.Fatalf("expected item name 'gpu-0', got %q", list.Items[0].GetName()) + } +} + +func TestStore_ImplementsInterface(t *testing.T) { + // Compile-time check that *Store satisfies storage.Interface. + var _ storage.Interface = (*Store)(nil) +} + +func TestStore_Watch_RejectsResourceVersion(t *testing.T) { + s := NewStore(codec) + ctx := t.Context() + + _, err := s.Watch(ctx, "/gpus/default/", storage.ListOptions{ + ResourceVersion: "5", + }) + if err == nil { + t.Fatal("expected error when Watch is called with non-empty ResourceVersion, got nil") + } +} + +func TestStore_Watch_EventDropOnFullBuffer(t *testing.T) { + s := NewStore(codec) + ctx := t.Context() + + w, err := s.Watch(ctx, "/gpus/default/", storage.ListOptions{}) + if err != nil { + t.Fatalf("Watch failed: %v", err) + } + defer w.Stop() + + // Fill the channel buffer (watchChannelSize = 100) plus overflow. + for i := 0; i < watchChannelSize+10; i++ { + name := fmt.Sprintf("gpu-%d", i) + obj := newTestObject(name, "default") + if err := s.Create(ctx, "/gpus/default/"+name, obj, nil, 0); err != nil { + t.Fatalf("Create %s failed: %v", name, err) + } + } + + // Drain the channel. We should get exactly watchChannelSize events + // (the rest were dropped because the buffer was full). 
+ received := 0 + for { + select { + case _, ok := <-w.ResultChan(): + if !ok { + t.Fatal("channel unexpectedly closed") + } + received++ + default: + goto done + } + } +done: + if received != watchChannelSize { + t.Fatalf("expected %d events (buffer size), got %d", watchChannelSize, received) + } +} diff --git a/pkg/storage/memory/watch.go b/pkg/storage/memory/watch.go new file mode 100644 index 000000000..6d6f7dd9b --- /dev/null +++ b/pkg/storage/memory/watch.go @@ -0,0 +1,130 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memory + +import ( + "strings" + "sync" + "sync/atomic" + + "k8s.io/apimachinery/pkg/watch" + "k8s.io/klog/v2" +) + +const watchChannelSize = 100 + +// watchManager tracks active watchers and broadcasts events to them. +// It uses its own mutex, separate from Store.mu, because sendLocked +// is called while the Store write lock is held. +type watchManager struct { + mu sync.Mutex + watchers map[int]*memoryWatcher + nextID int + watchBufferSize int +} + +func newWatchManager(bufferSize int) *watchManager { + return &watchManager{ + watchers: make(map[int]*memoryWatcher), + watchBufferSize: bufferSize, + } +} + +// watch creates a new watcher for the given key prefix and registers it. +// The caller must cancel the context or call Stop() to clean up. 
+func (wm *watchManager) watch(key string) *memoryWatcher { + wm.mu.Lock() + defer wm.mu.Unlock() + + id := wm.nextID + wm.nextID++ + + w := &memoryWatcher{ + id: id, + key: key, + ch: make(chan watch.Event, wm.watchBufferSize), + done: make(chan struct{}), + parent: wm, + } + + wm.watchers[id] = w + + return w +} + +// sendLocked broadcasts an event to all registered watchers whose key prefix +// matches the event's object key. This method is called while Store.mu is +// held (write lock), so it uses its own mutex for watcher iteration. +// Sends are non-blocking: if a watcher's channel is full, the event is dropped. +func (wm *watchManager) sendLocked(ev watch.Event, objectKey string) { + wm.mu.Lock() + defer wm.mu.Unlock() + + for _, w := range wm.watchers { + if !strings.HasPrefix(objectKey, w.key) { + continue + } + + select { + case w.ch <- ev: + default: + w.droppedEvents.Add(1) + } + } +} + +// remove unregisters a watcher by ID. +func (wm *watchManager) remove(id int) { + wm.mu.Lock() + defer wm.mu.Unlock() + + delete(wm.watchers, id) +} + +// memoryWatcher implements watch.Interface for in-memory storage events. +type memoryWatcher struct { + id int + key string + ch chan watch.Event + done chan struct{} + once sync.Once + parent *watchManager + droppedEvents atomic.Int64 +} + +var _ watch.Interface = (*memoryWatcher)(nil) + +// ResultChan returns the channel that receives watch events. +func (w *memoryWatcher) ResultChan() <-chan watch.Event { + return w.ch +} + +// Stop terminates the watcher, unregisters it from the parent manager, +// and closes the result channel. It is safe to call multiple times. 
+func (w *memoryWatcher) Stop() { + w.once.Do(func() { + if dropped := w.droppedEvents.Load(); dropped > 0 { + klog.V(2).InfoS("Watch stopped with dropped events", + "watcherID", w.id, + "key", w.key, + "droppedEvents", dropped, + ) + } + + w.parent.remove(w.id) + close(w.done) + close(w.ch) + }) +} diff --git a/pkg/storage/storagebackend/config.go b/pkg/storage/storagebackend/config.go index f6867f337..840f52708 100644 --- a/pkg/storage/storagebackend/config.go +++ b/pkg/storage/storagebackend/config.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -28,6 +28,10 @@ type Config struct { KineSocketPath string DatabaseDir string + // InMemory skips Kine/SQLite entirely. Services supply their own + // in-memory storage.Interface, so the backend only needs to report ready. + InMemory bool + StorageConfig apistorage.Config } @@ -40,6 +44,7 @@ func NewConfig(ctx context.Context, opts options.CompletedOptions) (*Config, err KineConfig: opts.KineConfig, KineSocketPath: opts.KineSocketPath, DatabaseDir: opts.DatabaseDir, + InMemory: opts.InMemory, } if err := opts.ApplyTo(&config.StorageConfig); err != nil { diff --git a/pkg/storage/storagebackend/config_test.go b/pkg/storage/storagebackend/config_test.go index ed5b5fcc3..e665891c8 100644 --- a/pkg/storage/storagebackend/config_test.go +++ b/pkg/storage/storagebackend/config_test.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -25,6 +25,7 @@ func TestNewConfig(t *testing.T) { ctx := context.Background() opts := options.NewOptions() + opts.InMemory = false opts.DatabasePath = "/tmp/nvsentinel/test.db" completedOpts, err := opts.Complete() diff --git a/pkg/storage/storagebackend/options/options.go b/pkg/storage/storagebackend/options/options.go index 306d02b4f..8951abbcf 100644 --- a/pkg/storage/storagebackend/options/options.go +++ b/pkg/storage/storagebackend/options/options.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -28,6 +28,10 @@ import ( ) type Options struct { + // InMemory skips the Kine/SQLite storage backend entirely. + // When true, services provide their own in-memory storage.Interface. + InMemory bool + DatabasePath string CompactionInterval time.Duration CompactionBatchSize int64 @@ -49,6 +53,7 @@ type CompletedOptions struct { func NewOptions() *Options { return &Options{ + InMemory: true, DatabasePath: "/var/lib/nvidia-device-api/state.db", CompactionInterval: 5 * time.Minute, CompactionBatchSize: 1000, @@ -64,6 +69,9 @@ func (o *Options) AddFlags(fss *cliflag.NamedFlagSets) { storageFs := fss.FlagSet("storage") + storageFs.BoolVar(&o.InMemory, "in-memory", o.InMemory, + "Use in-memory storage instead of SQLite/Kine. Services provide their own storage.Interface.") + storageFs.StringVar(&o.DatabasePath, "database-path", o.DatabasePath, "The path to the SQLite database file. Must be an absolute path.") @@ -80,6 +88,12 @@ func (o *Options) Complete() (CompletedOptions, error) { return CompletedOptions{}, nil } + // In-memory mode skips all Kine/SQLite configuration. 
+ if o.InMemory { + completed := completedOptions{Options: *o} + return CompletedOptions{completedOptions: &completed}, nil + } + if o.KineSocketPath == "" { o.KineSocketPath = "/var/run/nvidia-device-api/kine.sock" } @@ -127,6 +141,11 @@ func (o *Options) Validate() []error { return nil } + // In-memory mode requires no Kine/SQLite configuration. + if o.InMemory { + return nil + } + allErrors := []error{} if o.DatabasePath == "" { diff --git a/pkg/storage/storagebackend/options/options_test.go b/pkg/storage/storagebackend/options/options_test.go index 9079915fb..e5cfe1e83 100644 --- a/pkg/storage/storagebackend/options/options_test.go +++ b/pkg/storage/storagebackend/options/options_test.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -61,6 +61,7 @@ func TestAddFlags(t *testing.T) { func TestComplete(t *testing.T) { t.Run("Default assignments", func(t *testing.T) { opts := NewOptions() + opts.InMemory = false opts.DatabasePath = "" opts.KineSocketPath = "" @@ -85,6 +86,7 @@ func TestComplete(t *testing.T) { t.Run("Trims unix prefix from SocketPath", func(t *testing.T) { opts := NewOptions() + opts.InMemory = false opts.KineSocketPath = "unix:///tmp/test.sock" completed, _ := opts.Complete() @@ -95,6 +97,7 @@ func TestComplete(t *testing.T) { t.Run("Maps intervals to KineConfig", func(t *testing.T) { opts := NewOptions() + opts.InMemory = false opts.CompactionInterval = 10 * time.Minute opts.WatchProgressNotifyInterval = 15 * time.Second @@ -181,6 +184,7 @@ func TestValidate(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { opts := NewOptions() + opts.InMemory = false tt.modify(opts) completed, err := opts.Complete() @@ -211,6 +215,7 @@ func TestValidate(t *testing.T) { func TestApplyTo(t 
*testing.T) { opts := NewOptions() + opts.InMemory = false completed, _ := opts.Complete() storageCfg := &apistorage.Config{} diff --git a/pkg/storage/storagebackend/storage.go b/pkg/storage/storagebackend/storage.go index 2502efac9..ab790b4f5 100644 --- a/pkg/storage/storagebackend/storage.go +++ b/pkg/storage/storagebackend/storage.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ import ( "path/filepath" "strings" "sync/atomic" + "syscall" "time" "github.com/k3s-io/kine/pkg/endpoint" @@ -39,6 +40,10 @@ type Storage struct { StorageConfig apistorage.Config ETCDConfig *endpoint.ETCDConfig + // InMemory skips Kine/SQLite entirely. When true, the storage backend + // reports ready immediately and services use their own in-memory storage. 
+ InMemory bool + isReady atomic.Bool } @@ -52,10 +57,15 @@ func (c *CompletedConfig) New() (*Storage, error) { KineSocketPath: c.KineSocketPath, DatabaseDir: c.DatabaseDir, StorageConfig: c.StorageConfig, + InMemory: c.InMemory, }, nil } func (s *Storage) PrepareRun(ctx context.Context) (preparedStorage, error) { + if s.InMemory { + return preparedStorage{s}, nil + } + if err := s.prepareFilesystem(ctx); err != nil { return preparedStorage{}, err } @@ -101,9 +111,22 @@ func (s *preparedStorage) Run(ctx context.Context) error { func (s *Storage) run(ctx context.Context) error { logger := klog.FromContext(ctx) + if s.InMemory { + logger.V(2).Info("Starting in-memory storage backend (no persistence)") + s.isReady.Store(true) + <-ctx.Done() + logger.Info("Shutting down in-memory storage backend") + s.isReady.Store(false) + return nil + } + logger.V(2).Info("Starting storage backend", "database", s.KineConfig.Endpoint) s.isReady.Store(false) + // Restrict permissions on new files (socket) before Kine creates it. 
+ oldUmask := syscall.Umask(0117) // Creates socket as 0660 from the start + defer syscall.Umask(oldUmask) + etcdConfig, err := endpoint.Listen(ctx, s.KineConfig) if err != nil { return fmt.Errorf("failed to start storage backend: %w", err) @@ -114,7 +137,7 @@ func (s *Storage) run(ctx context.Context) error { socketPath := strings.TrimPrefix(s.KineSocketPath, "unix://") defer func() { if err := netutils.CleanupUDS(socketPath); err != nil { - klog.V(2).ErrorS(err, "Failed to cleanup socket", "path", socketPath) + klog.ErrorS(err, "Failed to cleanup kine socket", "path", socketPath) } }() @@ -157,8 +180,14 @@ func (s *Storage) waitForSocket(ctx context.Context) error { } conn.Close() //nolint:wsl_v5 + //nolint:gosec // G302: 0660 intentional — server and provider share a group if err := os.Chmod(socketPath, 0660); err != nil { + if os.IsPermission(err) { + return false, fmt.Errorf("failed to secure kine socket %q: %w", socketPath, err) + } + logger.V(4).Error(err, "Failed to secure socket, retrying", "path", socketPath) + return false, nil } @@ -169,8 +198,6 @@ func (s *Storage) waitForSocket(ctx context.Context) error { return fmt.Errorf("timed out waiting to connect to socket: %w", err) } - s.isReady.Store(true) - return nil } diff --git a/pkg/storage/storagebackend/storage_test.go b/pkg/storage/storagebackend/storage_test.go index b992d0602..7d446eadf 100644 --- a/pkg/storage/storagebackend/storage_test.go +++ b/pkg/storage/storagebackend/storage_test.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -113,6 +113,43 @@ func TestStorage_SocketInUse(t *testing.T) { } } +func TestStorage_InMemoryMode(t *testing.T) { + s := &Storage{InMemory: true} + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + ps, err := s.PrepareRun(ctx) + if err != nil { + t.Fatalf("PrepareRun failed: %v", err) + } + + runErr := make(chan error, 1) + go func() { + runErr <- ps.Run(ctx) + }() + + // In-memory should become ready almost immediately. + waitErr := wait.PollUntilContextTimeout(ctx, 10*time.Millisecond, 2*time.Second, true, func(ctx context.Context) (bool, error) { + return s.IsReady(), nil + }) + if waitErr != nil { + t.Fatal("In-memory storage did not become ready") + } + + cancel() + + select { + case <-runErr: + case <-time.After(2 * time.Second): + t.Error("In-memory storage did not shut down gracefully") + } + + if s.IsReady() { + t.Error("In-memory storage should not be ready after shutdown") + } +} + func TestStorage_WaitForSocket_Timeout(t *testing.T) { socketPath := testutils.NewUnixAddr(t) socketURL := "unix://" + socketPath diff --git a/pkg/testutil/grpcserver.go b/pkg/testutil/grpcserver.go new file mode 100644 index 000000000..3e9971474 --- /dev/null +++ b/pkg/testutil/grpcserver.go @@ -0,0 +1,118 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package testutil provides shared test infrastructure for gRPC integration tests. 
+package testutil + +import ( + "context" + "net" + "testing" + + clientset "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned" + gpuclient "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned/typed/device/v1alpha1" + + pb "github.com/nvidia/nvsentinel/internal/generated/device/v1alpha1" + svc "github.com/nvidia/nvsentinel/pkg/services/device/v1alpha1" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/test/bufconn" + apistorage "k8s.io/apiserver/pkg/storage/storagebackend" +) + +// NewTestGPUClient creates a bufconn-backed gRPC client for testing. +// It spins up a real gRPC server with the GPU service backed by in-memory storage. +// All resources are cleaned up when t finishes. +func NewTestGPUClient(t *testing.T) pb.GpuServiceClient { + t.Helper() + + lis := bufconn.Listen(1024 * 1024) + srv := grpc.NewServer() + + provider := svc.NewGPUServiceProvider() + service, err := provider.Install(srv, apistorage.Config{}) + if err != nil { + t.Fatalf("failed to install GPU service: %v", err) + } + + go func() { + if err := srv.Serve(lis); err != nil { + t.Logf("server stopped: %v", err) + } + }() + + conn, err := grpc.NewClient( + "passthrough:///bufconn", + grpc.WithContextDialer(func(context.Context, string) (net.Conn, error) { + return lis.Dial() + }), + grpc.WithTransportCredentials(insecure.NewCredentials()), + ) + if err != nil { + t.Fatalf("failed to create gRPC client: %v", err) + } + + t.Cleanup(func() { + conn.Close() + service.Cleanup() + srv.Stop() + lis.Close() + }) + + return pb.NewGpuServiceClient(conn) +} + +// NewTestGPUTypedClient creates a bufconn-backed typed GPU client for testing. +// It spins up a real gRPC server with the GPU service backed by in-memory storage, +// and returns a GPUInterface from the generated client SDK. +// All resources are cleaned up when t finishes. 
+func NewTestGPUTypedClient(t *testing.T) gpuclient.GPUInterface { + t.Helper() + + lis := bufconn.Listen(1024 * 1024) + srv := grpc.NewServer() + + provider := svc.NewGPUServiceProvider() + service, err := provider.Install(srv, apistorage.Config{}) + if err != nil { + t.Fatalf("failed to install GPU service: %v", err) + } + + go func() { + if err := srv.Serve(lis); err != nil { + t.Logf("server stopped: %v", err) + } + }() + + conn, err := grpc.NewClient( + "passthrough:///bufconn", + grpc.WithContextDialer(func(context.Context, string) (net.Conn, error) { + return lis.Dial() + }), + grpc.WithTransportCredentials(insecure.NewCredentials()), + ) + if err != nil { + t.Fatalf("failed to create gRPC client: %v", err) + } + + t.Cleanup(func() { + conn.Close() + service.Cleanup() + srv.Stop() + lis.Close() + }) + + cs := clientset.New(conn) + return cs.DeviceV1alpha1().GPUs() +} diff --git a/pkg/testutil/grpcserver_test.go b/pkg/testutil/grpcserver_test.go new file mode 100644 index 000000000..460f3c489 --- /dev/null +++ b/pkg/testutil/grpcserver_test.go @@ -0,0 +1,57 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package testutil + +import ( + "testing" + + pb "github.com/nvidia/nvsentinel/internal/generated/device/v1alpha1" +) + +func TestNewTestGPUClient_CreateAndGet(t *testing.T) { + client := NewTestGPUClient(t) + ctx := t.Context() + + const gpuName = "GPU-01234567-89ab-cdef-0123-456789abcdef" + + created, err := client.CreateGpu(ctx, &pb.CreateGpuRequest{ + Gpu: &pb.Gpu{ + Metadata: &pb.ObjectMeta{ + Name: gpuName, + Namespace: "default", + }, + Spec: &pb.GpuSpec{ + Uuid: "GPU-TEST-1", + }, + }, + }) + if err != nil { + t.Fatalf("CreateGpu failed: %v", err) + } + if created.GetMetadata().GetName() != gpuName { + t.Errorf("expected name %q, got %q", gpuName, created.GetMetadata().GetName()) + } + + resp, err := client.GetGpu(ctx, &pb.GetGpuRequest{ + Name: gpuName, + Namespace: "default", + }) + if err != nil { + t.Fatalf("GetGpu failed: %v", err) + } + if resp.GetGpu().GetSpec().GetUuid() != "GPU-TEST-1" { + t.Errorf("expected UUID %q, got %q", "GPU-TEST-1", resp.GetGpu().GetSpec().GetUuid()) + } +} diff --git a/pkg/util/net/uds.go b/pkg/util/net/uds.go index 1083f4352..25072e73b 100644 --- a/pkg/util/net/uds.go +++ b/pkg/util/net/uds.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -55,6 +55,9 @@ func CreateUDSListener(ctx context.Context, socketPath string, perm os.FileMode) lc := net.ListenConfig{} + // Note: There is a residual TOCTOU window between CleanupUDS and Listen. + // This is acceptable because Listen will fail with EADDRINUSE if another + // process binds the socket in that window. 
lis, err := lc.Listen(ctx, "unix", socketPath) if err != nil { return nil, nil, fmt.Errorf("failed to listen on unix socket %q: %w", socketPath, err) diff --git a/pkg/util/verflag/verflag.go b/pkg/util/verflag/verflag.go index 592a41f71..1dae5d3b9 100644 --- a/pkg/util/verflag/verflag.go +++ b/pkg/util/verflag/verflag.go @@ -1,4 +1,4 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ import ( "strconv" "text/tabwriter" - "github.com/nvidia/nvsentinel/pkg/util/version" + "github.com/nvidia/nvsentinel/pkg/version" "github.com/spf13/pflag" ) @@ -111,7 +111,7 @@ func printVersionTable() { fmt.Fprintf(w, "%s\n", programName) fmt.Fprintf(w, "---\t---\n") - fmt.Fprintf(w, "Version\t%s\n", v.GitVersion) + fmt.Fprintf(w, "Version\t%s\n", v.Version) fmt.Fprintf(w, "GitCommit\t%s\n", v.GitCommit) fmt.Fprintf(w, "BuildDate\t%s\n", v.BuildDate) fmt.Fprintf(w, "GoVersion\t%s\n", v.GoVersion) diff --git a/pkg/util/version/version.go b/pkg/util/version/version.go deleted file mode 100644 index dac336d55..000000000 --- a/pkg/util/version/version.go +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package version - -import ( - "encoding/json" - "fmt" - "net/http" - "runtime" - - utilversion "k8s.io/apimachinery/pkg/util/version" - "k8s.io/component-base/compatibility" -) - -var ( - GitVersion = "v0.0.0-devel" - GitCommit = "unknown" - BuildDate = "unknown" -) - -type Info struct { - GitVersion string - GitCommit string - BuildDate string - GoVersion string - Compiler string - Platform string -} - -func Get() Info { - return Info{ - GitVersion: GitVersion, - GitCommit: GitCommit, - BuildDate: BuildDate, - GoVersion: runtime.Version(), - Compiler: runtime.Compiler, - Platform: fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH), - } -} - -func (i Info) String() string { - return i.GitVersion -} - -// UserAgent returns the standard user agent string for clients. -func UserAgent() string { - return fmt.Sprintf("nvidia-device-api/%s (%s)", GitVersion, Get().Platform) -} - -func RegisterComponent(registry compatibility.ComponentGlobalsRegistry) error { - v, err := utilversion.ParseSemantic(GitVersion) - if err != nil { - v = utilversion.MustParseSemantic("v0.0.1") - } - - binaryVersion := v - emulationVersion := v - minCompatibilityVersion := v - - effectiveVer := compatibility.NewEffectiveVersion( - binaryVersion, - false, - emulationVersion, - minCompatibilityVersion, - ) - - if err := registry.Register("nvidia-device-api", effectiveVer, nil); err != nil { - return fmt.Errorf("failed to register component with compatibility registry: %w", err) - } - - return nil -} - -func Handler() http.Handler { - return http.HandlerFunc(versionHandler) -} - -func versionHandler(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - w.WriteHeader(http.StatusOK) - _ = json.NewEncoder(w).Encode(Get()) -} diff --git a/pkg/util/version/version_test.go b/pkg/util/version/version_test.go deleted file mode 100644 index 3548c63d9..000000000 --- a/pkg/util/version/version_test.go +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2025, NVIDIA 
CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package version - -import ( - "strings" - "testing" - - "k8s.io/component-base/compatibility" -) - -func TestGet(t *testing.T) { - info := Get() - - if info.GitVersion != GitVersion { - t.Errorf("expected GitVersion %s, got %s", GitVersion, info.GitVersion) - } - - if info.GoVersion == "" || info.Platform == "" { - t.Error("runtime info (GoVersion/Platform) should not be empty") - } -} - -func TestUserAgent(t *testing.T) { - ua := UserAgent() - expectedPrefix := "nvidia-device-api/" + GitVersion - - if !strings.HasPrefix(ua, expectedPrefix) { - t.Errorf("UserAgent %s does not start with %s", ua, expectedPrefix) - } -} - -func TestRegisterComponent(t *testing.T) { - tests := []struct { - name string - gitVersion string - }{ - { - name: "valid semver", - gitVersion: "v1.2.3", - }, - { - name: "invalid semver uses fallback", - gitVersion: "development-build", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - oldVersion := GitVersion - GitVersion = tt.gitVersion - defer func() { GitVersion = oldVersion }() - - registry := compatibility.NewComponentGlobalsRegistry() - - defer func() { - if r := recover(); r != nil { - t.Errorf("RegisterComponent panicked for version %s: %v", tt.gitVersion, r) - } - }() - - RegisterComponent(registry) - - effective := registry.EffectiveVersionFor("nvidia-device-api") - if effective == nil { - t.Fatal("component was 
not registered in the registry") - } - - if effective.BinaryVersion() == nil { - t.Error("EffectiveVersion has nil BinaryVersion") - } - }) - } -} diff --git a/pkg/version/version.go b/pkg/version/version.go new file mode 100644 index 000000000..f2f31aa6f --- /dev/null +++ b/pkg/version/version.go @@ -0,0 +1,98 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package version provides version information for the Device API Server. +// These values are set at build time via ldflags. +package version + +import ( + "encoding/json" + "fmt" + "net/http" + "runtime" +) + +// Build information set at compile time via -ldflags. +var ( + // Version is the semantic version of the build. + Version = "dev" + + // GitCommit is the git commit SHA at build time. + GitCommit = "unknown" + + // GitTreeState indicates if the git tree was clean or dirty. + GitTreeState = "unknown" + + // BuildDate is the date of the build in ISO 8601 format. + BuildDate = "unknown" +) + +// Info contains version information. +type Info struct { + Version string `json:"version"` + GitCommit string `json:"gitCommit"` + GitTreeState string `json:"gitTreeState"` + BuildDate string `json:"buildDate"` + GoVersion string `json:"goVersion"` + Compiler string `json:"compiler"` + Platform string `json:"platform"` +} + +// Get returns the version information. 
+func Get() Info { + return Info{ + Version: Version, + GitCommit: GitCommit, + GitTreeState: GitTreeState, + BuildDate: BuildDate, + GoVersion: runtime.Version(), + Compiler: runtime.Compiler, + Platform: fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH), + } +} + +// String returns version information as a human-readable string. +func (i Info) String() string { + return fmt.Sprintf( + "Version: %s\nGit Commit: %s\nGit Tree State: %s\nBuild Date: %s\nGo Version: %s\nCompiler: %s\nPlatform: %s", + i.Version, + i.GitCommit, + i.GitTreeState, + i.BuildDate, + i.GoVersion, + i.Compiler, + i.Platform, + ) +} + +// Short returns a short version string. +func (i Info) Short() string { + return fmt.Sprintf("%s (%s)", i.Version, i.GitCommit) +} + +// UserAgent returns the standard user agent string for clients. +func UserAgent() string { + return fmt.Sprintf("nvidia-device-api/%s (%s)", Version, Get().Platform) +} + +// Handler returns an HTTP handler that responds with version information as JSON. +func Handler() http.Handler { + return http.HandlerFunc(versionHandler) +} + +func versionHandler(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(Get()) +} diff --git a/pkg/version/version_test.go b/pkg/version/version_test.go new file mode 100644 index 000000000..78c66358e --- /dev/null +++ b/pkg/version/version_test.go @@ -0,0 +1,68 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package version + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +func TestGet(t *testing.T) { + info := Get() + + if info.Version != Version { + t.Errorf("expected Version %s, got %s", Version, info.Version) + } + + if info.GoVersion == "" || info.Platform == "" { + t.Error("runtime info (GoVersion/Platform) should not be empty") + } +} + +func TestUserAgent(t *testing.T) { + ua := UserAgent() + expectedPrefix := "nvidia-device-api/" + Version + + if !strings.HasPrefix(ua, expectedPrefix) { + t.Errorf("UserAgent %s does not start with %s", ua, expectedPrefix) + } +} + +func TestHandler(t *testing.T) { + req := httptest.NewRequest(http.MethodGet, "/version", nil) + w := httptest.NewRecorder() + + Handler().ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Errorf("expected status %d, got %d", http.StatusOK, w.Code) + } + + if ct := w.Header().Get("Content-Type"); ct != "application/json" { + t.Errorf("expected Content-Type application/json, got %s", ct) + } + + var info Info + if err := json.NewDecoder(w.Body).Decode(&info); err != nil { + t.Fatalf("failed to decode response body: %v", err) + } + + if info.Version != Version { + t.Errorf("expected version %s in response, got %s", Version, info.Version) + } +} diff --git a/test/integration/client-go/device/v1alpha1/clientset_test.go b/test/integration/client-go/device/v1alpha1/clientset_test.go deleted file mode 100644 index 6745e3003..000000000 --- a/test/integration/client-go/device/v1alpha1/clientset_test.go +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package v1alpha1_test - -import ( - "context" - "encoding/json" - "fmt" - "strconv" - "testing" - "time" - - devicev1alpha1 "github.com/nvidia/nvsentinel/api/device/v1alpha1" - "github.com/nvidia/nvsentinel/cmd/device-apiserver/app" - "github.com/nvidia/nvsentinel/cmd/device-apiserver/app/options" - "github.com/nvidia/nvsentinel/pkg/client-go/client/versioned" - "github.com/nvidia/nvsentinel/pkg/grpc/client" - "github.com/nvidia/nvsentinel/pkg/util/testutils" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -func TestEndToEnd(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - tmpDir := t.TempDir() - - socketPath := testutils.NewUnixAddr(t) - kineSocket := fmt.Sprintf("unix://%s", testutils.NewUnixAddr(t)) - healthAddr := testutils.GetFreeTCPAddress(t) - - opts := options.NewServerRunOptions() - opts.NodeName = "test-node" - opts.GRPC.BindAddress = "unix://" + socketPath - opts.HealthAddress = healthAddr - opts.Storage.DatabaseDir = tmpDir - opts.Storage.DatabasePath = tmpDir + "state.db" - opts.Storage.KineSocketPath = kineSocket - opts.Storage.KineConfig.Endpoint = fmt.Sprintf("sqlite://%s/db.sqlite", tmpDir) - opts.Storage.KineConfig.Listener = kineSocket - - completed, err := opts.Complete(ctx) - if err != nil { - t.Fatalf("Failed to complete options: %v", err) - } - - go func() { - if err := app.Run(ctx, completed); err != nil && err != context.Canceled { - t.Errorf("Server exited with error: %v", err) - } - }() - - testutils.WaitForStatus(t, healthAddr, "", 5*time.Second, testutils.IsServing) 
- - config := &client.Config{Target: "unix://" + socketPath} - client, err := versioned.NewForConfig(config) - if err != nil { - t.Fatalf("Failed to create clientset: %v", err) - } - - var created *devicev1alpha1.GPU - - t.Run("Create", func(t *testing.T) { - gpu := &devicev1alpha1.GPU{ - ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-ad2367dd-a40e-6b86-6fc3-c44a2cc92c7e", - }, - Spec: devicev1alpha1.GPUSpec{ - UUID: "GPU-ad2367dd-a40e-6b86-6fc3-c44a2cc92c7e", - }, - Status: devicev1alpha1.GPUStatus{ - Conditions: []metav1.Condition{ - { - Type: "Ready", - Status: metav1.ConditionFalse, - Reason: "DriverNotReaady", - Message: "Driver is posting ready status", - }, - }, - }, - } - - created, err = client.DeviceV1alpha1().GPUs().Create(ctx, gpu, metav1.CreateOptions{}) - if err != nil { - t.Fatalf("Failed to create GPU: %v", err) - } - - // Client generated fields - if created.Kind != "GPU" { - t.Errorf("expected kind 'GPU', got %s", created.Kind) - } - if created.APIVersion != devicev1alpha1.SchemeGroupVersion.String() { - t.Errorf("expected version %s, got %s", devicev1alpha1.SchemeGroupVersion.String(), created.APIVersion) - } - - // Server generated fields - if created.Namespace != "default" { - t.Error("Server failed to set default namespace") - } - if created.UID == "" { - t.Error("Server failed to generate a UID for the GPU") - } - if created.ResourceVersion == "" { - t.Error("Server failed to generate a ResourceVersion") - } - if created.Generation != 1 { - t.Error("Server failed to set initial Generation") - } - if created.CreationTimestamp.IsZero() { - t.Error("Server failed to set a CreationTimestamp") - } - - // Data integrity - if created.Name != gpu.Name { - t.Errorf("expected name %q, got %q", gpu.Name, created.Name) - } - if created.Spec.UUID != gpu.Spec.UUID { - t.Errorf("expected UUID %q, got %q", gpu.Spec.UUID, created.Spec.UUID) - } - - // Data integrity: Status - if len(created.Status.Conditions) != len(gpu.Status.Conditions) { - t.Fatalf("expected 
%d conditions, got %d", len(gpu.Status.Conditions), len(created.Status.Conditions)) - } - - cond := created.Status.Conditions[0] - expected := gpu.Status.Conditions[0] - - if cond.Type != expected.Type { - t.Errorf("expected condition Type %q, got %q", expected.Type, cond.Type) - } - if cond.Status != expected.Status { - t.Errorf("expected condition Status %q, got %q", expected.Status, cond.Status) - } - if cond.Reason != expected.Reason { - t.Errorf("expected condition Reason %q, got %q", expected.Reason, cond.Reason) - } - if cond.Message != expected.Message { - t.Errorf("expected condition Message %q, got %q", expected.Message, cond.Message) - } - if cond.LastTransitionTime.IsZero() { - t.Error("condition LastTransitionTime should not be zero") - } - - // TODO: remove - objJson, _ := json.MarshalIndent(created, "", " ") - fmt.Printf("\n--- [Object After Creation] ---\n%s\n", string(objJson)) - }) - - t.Run("Update", func(t *testing.T) { - if created == nil { - t.Skip("Skipping: Create failed") - } - - toUpdate := created.DeepCopy() - toUpdate.Spec.UUID = "GPU-cd2367dd-a40e-6b86-6fc3-c44a2cc92c7d" - - updated, err := client.DeviceV1alpha1().GPUs().Update(ctx, toUpdate, metav1.UpdateOptions{}) - if err != nil { - t.Fatalf("Failed to update GPU: %v", err) - } - - if updated.Spec.UUID != toUpdate.Spec.UUID { - t.Errorf("expected UUID %q, got %q", toUpdate.Spec.UUID, updated.Spec.UUID) - } - - oldRV, _ := strconv.ParseInt(created.ResourceVersion, 10, 64) - updatedRV, _ := strconv.ParseInt(updated.ResourceVersion, 10, 64) - - if updatedRV <= oldRV { - t.Errorf("expected ResourceVersion to increase, got %d (old) and %d (new)", oldRV, updatedRV) - } - - if updated.Generation <= created.Generation { - t.Errorf("expected Generation to increase, got %d (old) and %d (new)", created.Generation, updated.Generation) - } - - // TODO: remove - objJson, _ := json.MarshalIndent(updated, "", " ") - fmt.Printf("\n--- [Object After Update] ---\n%s\n", string(objJson)) - }) - - // 
TODO: add tests for Delete, List, Watch -}