From 8269f86dc67c94a4f46f9d854003a93198fc7f87 Mon Sep 17 00:00:00 2001 From: Elliott Davis Date: Sat, 22 Nov 2025 18:50:52 -0600 Subject: [PATCH] Add vGPU license visibility and condition management - Updated README to document new vGPU license visibility features, including annotations for license status and a condition in ClusterPolicy. - Introduced license collection logic in `license_collector.go` and corresponding tests in `license_collector_test.go`. - Implemented metrics for license status updates in `metrics_license.go` and integrated it into the NodeMetrics workflow. - Added condition management for vGPU licensing in `license_condition.go`, with tests to validate behavior in various licensing scenarios. - Enhanced permissions in `0300_clusterrole.yaml` to allow patching of node annotations. This update improves visibility and management of vGPU licensing, facilitating easier diagnostics and compliance monitoring. Closes: #1477 Signed-off-by: Elliott Davis --- README.md | 20 ++ .../0300_clusterrole.yaml | 1 + cmd/nvidia-validator/license_collector.go | 199 +++++++++++++++ .../license_collector_test.go | 32 +++ cmd/nvidia-validator/metrics.go | 3 + cmd/nvidia-validator/metrics_license.go | 131 ++++++++++ controllers/clusterpolicy_controller.go | 4 + controllers/license_condition.go | 241 ++++++++++++++++++ controllers/license_condition_test.go | 80 ++++++ internal/conditions/conditions.go | 2 + internal/conditions/consts.go | 13 + internal/licenseinfo/snapshot.go | 76 ++++++ 12 files changed, 802 insertions(+) create mode 100644 cmd/nvidia-validator/license_collector.go create mode 100644 cmd/nvidia-validator/license_collector_test.go create mode 100644 cmd/nvidia-validator/metrics_license.go create mode 100644 controllers/license_condition.go create mode 100644 controllers/license_condition_test.go create mode 100644 internal/licenseinfo/snapshot.go diff --git a/README.md b/README.md index 4a166cc99..182e071c8 100644 --- a/README.md +++ b/README.md @@ -25,3 
+25,23 @@ For information on platform support and getting started, visit the official docu ## Support and Getting Help Please open [an issue on the GitHub project](https://github.com/NVIDIA/gpu-operator/issues/new) for any questions. Your feedback is appreciated. + +## vGPU License Visibility +When the operator configures a node for `vm-vgpu` workloads it now reports the license state directly through the Kubernetes API: + +- Each vGPU node receives an annotation `nvidia.com/vgpu-license-statuses` that contains a JSON snapshot of the most recent `nvidia-smi vgpu -q` output, including per-device status and expiry timestamps. +- The `ClusterPolicy` resource exposes a `Licensed` condition that summarizes the state of every vGPU node. It turns `False` if any device is unlicensed or nearing expiry, and `Unknown` if data from the node-status-exporter is missing. + +You can inspect the node-level data with: + +```bash +kubectl get node <node-name> -o jsonpath='{.metadata.annotations.nvidia\.com/vgpu-license-statuses}' +``` + +and the cluster-level summary through: + +```bash +kubectl get clusterpolicy gpu-cluster-policy -o jsonpath='{.status.conditions[?(@.type=="Licensed")]}' +``` + +This makes it easier for users and automation to diagnose misconfigured or expired licenses without shelling into the node. diff --git a/assets/state-node-status-exporter/0300_clusterrole.yaml b/assets/state-node-status-exporter/0300_clusterrole.yaml index 6f91fe237..9f121e4a7 100644 --- a/assets/state-node-status-exporter/0300_clusterrole.yaml +++ b/assets/state-node-status-exporter/0300_clusterrole.yaml @@ -11,3 +11,4 @@ rules: - get - list - watch + - patch diff --git a/cmd/nvidia-validator/license_collector.go b/cmd/nvidia-validator/license_collector.go new file mode 100644 index 000000000..0fbaf9b4a --- /dev/null +++ b/cmd/nvidia-validator/license_collector.go @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "bufio" + "bytes" + "context" + "fmt" + "os/exec" + "regexp" + "strings" + "time" + + "github.com/NVIDIA/gpu-operator/internal/licenseinfo" +) + +const ( + licenseQueryCommand = "nvidia-smi" + licenseQueryArgs = "vgpu -q" + licenseSource = "nvidia-smi vgpu -q" +) + +var ( + gpuHeaderRegexp = regexp.MustCompile(`^GPU\s+([0-9A-Fa-fx:.]+)`) + keyValueRegexp = regexp.MustCompile(`^([^:]+):\s*(.*)$`) + expiryMarker = "Expiry:" + licenseTimeParse = []string{ + "2006-1-2 15:04:05 MST", + "2006-01-02 15:04:05 MST", + "2006-1-2 15:04:05", + time.RFC3339, + time.RFC1123Z, + time.RFC1123, + } +) + +// collectLicenseSnapshot runs nvidia-smi and parses vGPU license information. +// Errors are propagated and also captured in the returned snapshot so that callers +// can still surface diagnostic data to the cluster. 
+func collectLicenseSnapshot(ctx context.Context, now time.Time) (licenseinfo.Snapshot, error) { + snapshot := licenseinfo.NewSnapshot(nil, licenseSource, now) + + output, err := runLicenseQuery(ctx) + if err != nil { + snapshot.Error = err.Error() + return snapshot, err + } + + devices, parseErr := parseVGPULicenseOutput(output) + snapshot.Devices = devices + if parseErr != nil { + snapshot.Error = parseErr.Error() + return snapshot, parseErr + } + if len(devices) == 0 { + err := fmt.Errorf("no vGPU license information found in nvidia-smi output") + snapshot.Error = err.Error() + return snapshot, err + } + + return snapshot, nil +} + +func runLicenseQuery(ctx context.Context) (string, error) { + args := strings.Split(licenseQueryArgs, " ") + cmd := exec.CommandContext(ctx, licenseQueryCommand, args...) + var combined bytes.Buffer + cmd.Stdout = &combined + cmd.Stderr = &combined + if err := cmd.Run(); err != nil { + return "", fmt.Errorf("%s failed: %w: %s", licenseSource, err, strings.TrimSpace(combined.String())) + } + return combined.String(), nil +} + +func parseVGPULicenseOutput(output string) ([]licenseinfo.DeviceStatus, error) { + scanner := bufio.NewScanner(strings.NewReader(output)) + var ( + devices []licenseinfo.DeviceStatus + current *licenseinfo.DeviceStatus + ) + + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + + if matches := gpuHeaderRegexp.FindStringSubmatch(line); len(matches) == 2 { + if current != nil { + devices = append(devices, *current) + } + current = &licenseinfo.DeviceStatus{ID: matches[1]} + continue + } + + if current == nil { + continue + } + + key, value := parseKeyValue(line) + switch key { + case "Product Name": + current.Product = value + case "License Status": + status, expiryCandidate := extractStatusAndExpiry(value) + current.Status = status + current.Licensed = isStatusLicensed(status) + if expiryCandidate != "" && current.Expiry == nil { + if ts, err := 
parseLicenseTimestamp(expiryCandidate); err == nil { + current.Expiry = &ts + } else if current.Message == "" { + current.Message = fmt.Sprintf("license expiry: %s", expiryCandidate) + } + } + case "License Expiry": + if ts, err := parseLicenseTimestamp(value); err == nil { + current.Expiry = &ts + } else if current.Message == "" { + current.Message = fmt.Sprintf("license expiry: %s", value) + } + case "vGPU Software Licensed": + if !isAffirmative(value) { + current.Licensed = false + } + } + } + + if current != nil { + devices = append(devices, *current) + } + + if err := scanner.Err(); err != nil { + return devices, fmt.Errorf("failed to parse license output: %w", err) + } + return devices, nil +} + +func parseKeyValue(line string) (string, string) { + matches := keyValueRegexp.FindStringSubmatch(line) + if len(matches) != 3 { + return line, "" + } + return strings.TrimSpace(matches[1]), strings.TrimSpace(matches[2]) +} + +func extractStatusAndExpiry(value string) (string, string) { + idx := strings.Index(value, expiryMarker) + if idx == -1 { + return value, "" + } + status := strings.TrimSpace(strings.Trim(value[:idx], "()")) + expiry := strings.TrimSpace(strings.Trim(value[idx+len(expiryMarker):], "()")) + return status, expiry +} + +func isStatusLicensed(status string) bool { + if status == "" { + return false + } + lower := strings.ToLower(status) + if strings.Contains(lower, "unlicensed") || strings.Contains(lower, "not licensed") || strings.Contains(lower, "expired") { + return false + } + return strings.Contains(lower, "licensed") +} + +func isAffirmative(value string) bool { + lower := strings.ToLower(value) + return lower == "yes" || lower == "true" +} + +func parseLicenseTimestamp(value string) (time.Time, error) { + val := strings.TrimSpace(value) + if val == "" || strings.EqualFold(val, "n/a") { + return time.Time{}, fmt.Errorf("empty expiry") + } + for _, layout := range licenseTimeParse { + if ts, err := time.Parse(layout, val); err == nil { + 
return ts.UTC(), nil + } + } + return time.Time{}, fmt.Errorf("unsupported expiry format: %s", value) +} diff --git a/cmd/nvidia-validator/license_collector_test.go b/cmd/nvidia-validator/license_collector_test.go new file mode 100644 index 000000000..665bd8743 --- /dev/null +++ b/cmd/nvidia-validator/license_collector_test.go @@ -0,0 +1,32 @@ +package main + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestParseVGPULicenseOutput(t *testing.T) { + sample := ` +GPU 00000000:02:00.0 + Product Name : NVIDIA A16 + License Status : Licensed (Expiry: 2025-6-26 21:46:51 GMT) + License Expiry : 2025-6-26 21:46:51 GMT + +GPU 00000000:82:00.0 + Product Name : NVIDIA A16 + License Status : Unlicensed + License Expiry : N/A +` + devices, err := parseVGPULicenseOutput(sample) + require.NoError(t, err) + require.Len(t, devices, 2) + require.Equal(t, "00000000:02:00.0", devices[0].ID) + require.True(t, devices[0].Licensed) + require.NotNil(t, devices[0].Expiry) + require.Equal(t, "Licensed", devices[0].Status) + + require.Equal(t, "00000000:82:00.0", devices[1].ID) + require.False(t, devices[1].Licensed) + require.Nil(t, devices[1].Expiry) +} diff --git a/cmd/nvidia-validator/metrics.go b/cmd/nvidia-validator/metrics.go index 4105dd166..78d2709da 100644 --- a/cmd/nvidia-validator/metrics.go +++ b/cmd/nvidia-validator/metrics.go @@ -43,6 +43,8 @@ const ( driverValidationCheckDelaySeconds = 60 // pluginValidationCheckDelaySeconds indicates the delay between two checks of the device plugin validation, in seconds pluginValidationCheckDelaySeconds = 30 + // licenseStatusCheckDelaySeconds indicates how often license annotations are refreshed. 
+ licenseStatusCheckDelaySeconds = 60 ) // NodeMetrics contains the port of the metrics server and the @@ -308,6 +310,7 @@ func (nm *NodeMetrics) Run() error { go nm.watchDriverValidation() go nm.watchDevicePluginValidation() go nm.watchNVIDIAPCI() + go nm.watchLicenseAnnotations() log.Printf("Running the metrics server, listening on :%d/metrics", nm.port) http.Handle("/metrics", promhttp.Handler()) diff --git a/cmd/nvidia-validator/metrics_license.go b/cmd/nvidia-validator/metrics_license.go new file mode 100644 index 000000000..59e9ec910 --- /dev/null +++ b/cmd/nvidia-validator/metrics_license.go @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"time"
+
+	"github.com/NVIDIA/gpu-operator/internal/licenseinfo"
+	log "github.com/sirupsen/logrus"
+	meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+)
+
+// watchLicenseAnnotations publishes this node's serialized vGPU license
+// snapshot as the licenseinfo.AnnotationKey node annotation. It rebuilds the
+// payload once per licenseStatusCheckDelaySeconds and patches the node only
+// when the payload differs from the last one that was written successfully.
+// The loop runs until nm.ctx is cancelled; failures to build the in-cluster
+// client are logged and end the publisher.
+func (nm *NodeMetrics) watchLicenseAnnotations() {
+	kubeConfig, err := rest.InClusterConfig()
+	if err != nil {
+		log.Errorf("metrics: License: error getting cluster config - %v", err)
+		return
+	}
+
+	kubeClient, err := kubernetes.NewForConfig(kubeConfig)
+	if err != nil {
+		log.Errorf("metrics: License: error getting k8s client - %v", err)
+		return
+	}
+
+	log.Infof("metrics: License: starting publisher for annotation %s", licenseinfo.AnnotationKey)
+	ticker := time.NewTicker(licenseStatusCheckDelaySeconds * time.Second)
+	defer ticker.Stop()
+
+	var lastPayload string
+	for {
+		select {
+		case <-nm.ctx.Done():
+			log.Info("metrics: License: context cancelled, stopping publisher")
+			return
+		default:
+		}
+
+		payload, buildErr := nm.buildLicenseAnnotation(kubeClient)
+		if buildErr != nil {
+			log.Warningf("metrics: License: %v", buildErr)
+		}
+
+		// Skip the API call when nothing changed since the last successful patch.
+		if payload != lastPayload {
+			if err := patchLicenseAnnotation(nm.ctx, kubeClient, payload); err != nil {
+				log.Errorf("metrics: License: failed to update node annotation: %v", err)
+			} else {
+				lastPayload = payload
+			}
+		}
+
+		// Sleep until the next tick, but wake up immediately on cancellation so
+		// the goroutine does not linger for a full interval after shutdown. The
+		// check at the top of the loop logs and returns.
+		select {
+		case <-nm.ctx.Done():
+		case <-ticker.C:
+		}
+	}
+}
+
+// buildLicenseAnnotation returns the serialized license snapshot for the
+// current node. It returns an empty payload and a nil error when the node is
+// not labelled for the vm-vgpu workload, and an empty payload with an error
+// when the node cannot be fetched or the snapshot cannot be marshalled. When
+// collection itself fails, the payload is still returned together with the
+// collection error.
+func (nm *NodeMetrics) buildLicenseAnnotation(kubeClient kubernetes.Interface) (string, error) {
+	node, err := getNode(nm.ctx, kubeClient)
+	if err != nil {
+		return "", fmt.Errorf("unable to fetch node %s: %w", nodeNameFlag, err)
+	}
+
+	workloadConfig := node.GetLabels()[gpuWorkloadConfigLabelKey]
+	if workloadConfig != gpuWorkloadConfigVMVgpu {
+		return "", nil
+	}
+
+	now := time.Now()
+	snapshot, snapErr := collectLicenseSnapshot(nm.ctx, now)
+	payload, marshalErr := snapshot.Marshal()
+	if marshalErr != nil {
+		return "",
marshalErr + } + + if snapErr != nil { + return payload, snapErr + } + return payload, nil +} + +func patchLicenseAnnotation(ctx context.Context, kubeClient kubernetes.Interface, serializedSnapshot string) error { + var annotations map[string]interface{} + if serializedSnapshot == "" { + annotations = map[string]interface{}{ + licenseinfo.AnnotationKey: nil, + } + } else { + annotations = map[string]interface{}{ + licenseinfo.AnnotationKey: serializedSnapshot, + } + } + + patch := map[string]interface{}{ + "metadata": map[string]interface{}{ + "annotations": annotations, + }, + } + + patchBytes, err := json.Marshal(patch) + if err != nil { + return fmt.Errorf("failed to marshal annotation patch: %w", err) + } + + _, err = kubeClient.CoreV1().Nodes().Patch(ctx, nodeNameFlag, types.MergePatchType, patchBytes, meta_v1.PatchOptions{}) + if err != nil { + return fmt.Errorf("failed to patch node %s: %w", nodeNameFlag, err) + } + return nil +} diff --git a/controllers/clusterpolicy_controller.go b/controllers/clusterpolicy_controller.go index d16d2d445..39382c283 100644 --- a/controllers/clusterpolicy_controller.go +++ b/controllers/clusterpolicy_controller.go @@ -170,6 +170,10 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques } } + if err := r.syncLicenseCondition(ctx, instance); err != nil { + r.Log.Error(err, "unable to update license condition") + } + // if any state is not ready, requeue for reconcile after 5 seconds if overallStatus != gpuv1.Ready { clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady) diff --git a/controllers/license_condition.go b/controllers/license_condition.go new file mode 100644 index 000000000..edb18e63a --- /dev/null +++ b/controllers/license_condition.go @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package controllers + +import ( + "context" + "fmt" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/retry" + "sigs.k8s.io/controller-runtime/pkg/client" + + gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" + "github.com/NVIDIA/gpu-operator/internal/conditions" + "github.com/NVIDIA/gpu-operator/internal/licenseinfo" +) + +const ( + licenseExpiryWarningWindow = 14 * 24 * time.Hour + licenseIssueLimit = 3 +) + +type nodeLicenseSnapshot struct { + node string + annotationPresent bool + snapshot *licenseinfo.Snapshot + err error +} + +func (r *ClusterPolicyReconciler) syncLicenseCondition(ctx context.Context, instance *gpuv1.ClusterPolicy) error { + condition, err := r.buildLicenseCondition(ctx, instance) + if condition == nil { + return err + } + if setErr := r.setCustomCondition(ctx, instance, *condition); setErr != nil { + return setErr + } + return err +} + +func (r *ClusterPolicyReconciler) buildLicenseCondition(ctx context.Context, instance *gpuv1.ClusterPolicy) (*metav1.Condition, error) { + nodes := &corev1.NodeList{} + opts := []client.ListOption{ + client.MatchingLabels{ + commonGPULabelKey: commonGPULabelValue, + gpuWorkloadConfigLabelKey: gpuWorkloadConfigVMVgpu, + }, + } + + if err := r.Client.List(ctx, nodes, opts...); err != nil { + cond := 
&metav1.Condition{
+			Type:    conditions.Licensed,
+			Status:  metav1.ConditionUnknown,
+			Reason:  conditions.LicenseCollectionFailed,
+			Message: fmt.Sprintf("failed to list vGPU nodes: %v", err),
+		}
+		return cond, err
+	}
+
+	if len(nodes.Items) == 0 {
+		return &metav1.Condition{
+			Type:    conditions.Licensed,
+			Status:  metav1.ConditionTrue,
+			Reason:  conditions.LicenseNotRequired,
+			Message: "No vm-vgpu nodes detected; licensing not required",
+		}, nil
+	}
+
+	snapshots := make([]nodeLicenseSnapshot, 0, len(nodes.Items))
+	// Index into the slice instead of ranging by value so each Node is not copied.
+	for i := range nodes.Items {
+		node := &nodes.Items[i]
+		// Reading from a nil annotation map yields "", so no nil guard is needed.
+		annotation := node.Annotations[licenseinfo.AnnotationKey]
+		snap := nodeLicenseSnapshot{
+			node:              node.Name,
+			annotationPresent: annotation != "",
+		}
+		if annotation == "" {
+			snap.err = fmt.Errorf("license annotation missing")
+		} else {
+			parsed, err := licenseinfo.Parse(annotation)
+			if err != nil {
+				snap.err = err
+			} else {
+				// A snapshot may parse cleanly yet carry the collector's own error.
+				if parsed.Error != "" {
+					snap.err = fmt.Errorf("%s", parsed.Error)
+				}
+				snap.snapshot = &parsed
+			}
+		}
+		snapshots = append(snapshots, snap)
+	}
+
+	cond := summarizeLicenseSnapshots(snapshots, time.Now())
+	cond.Type = conditions.Licensed
+	return &cond, nil
+}
+
+// setCustomCondition upserts condition into the status of the ClusterPolicy
+// named by instance. The object is re-read on every attempt, the condition's
+// ObservedGeneration is set to the generation just read, and the status
+// update is retried while the API server reports a conflict.
+func (r *ClusterPolicyReconciler) setCustomCondition(ctx context.Context, instance *gpuv1.ClusterPolicy, condition metav1.Condition) error {
+	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+		current := &gpuv1.ClusterPolicy{}
+		if err := r.Get(ctx, types.NamespacedName{Name: instance.Name}, current); err != nil {
+			return err
+		}
+		condition.ObservedGeneration = current.Generation
+		meta.SetStatusCondition(&current.Status.Conditions, condition)
+		return r.Client.Status().Update(ctx, current)
+	})
+}
+
+// summarizeLicenseSnapshots folds the per-node snapshots into one condition.
+// The result is False when any device is unlicensed, expired, or expires
+// within licenseExpiryWarningWindow of now; Unknown when no such device
+// exists but some node data is missing or incomplete; and True otherwise.
+// The condition Type is left empty for the caller to fill in.
+func summarizeLicenseSnapshots(snapshots []nodeLicenseSnapshot, now time.Time) metav1.Condition {
+	var (
+		unlicensed     []string
+		expiringSoon   []string
+		dataIssues     []string
+		licensedCount  int
+		reportedNodes  int
+		earliestExpiry *time.Time
+ ) + + for _, snap := range snapshots { + if !snap.annotationPresent { + dataIssues = append(dataIssues, fmt.Sprintf("%s: annotation missing", snap.node)) + continue + } + if snap.snapshot == nil { + dataIssues = append(dataIssues, fmt.Sprintf("%s: %v", snap.node, snap.err)) + continue + } + reportedNodes++ + + if snap.err != nil { + dataIssues = append(dataIssues, fmt.Sprintf("%s: %v", snap.node, snap.err)) + } + + if len(snap.snapshot.Devices) == 0 { + dataIssues = append(dataIssues, fmt.Sprintf("%s: no vGPU devices reported", snap.node)) + continue + } + + for _, device := range snap.snapshot.Devices { + descriptor := formatDeviceDescriptor(snap.node, device) + if !device.Licensed { + unlicensed = append(unlicensed, fmt.Sprintf("%s (%s)", descriptor, safeStatus(device.Status))) + continue + } + if device.Expiry != nil { + expiry := device.Expiry.UTC() + if earliestExpiry == nil || expiry.Before(*earliestExpiry) { + exp := expiry + earliestExpiry = &exp + } + if expiry.Before(now) { + unlicensed = append(unlicensed, fmt.Sprintf("%s (expired %s)", descriptor, expiry.Format(time.RFC3339))) + continue + } + if expiry.Sub(now) <= licenseExpiryWarningWindow { + expiringSoon = append(expiringSoon, fmt.Sprintf("%s (expires %s)", descriptor, expiry.Format(time.RFC3339))) + continue + } + } + licensedCount++ + } + } + + switch { + case len(unlicensed) > 0: + return metav1.Condition{ + Status: metav1.ConditionFalse, + Reason: conditions.LicenseNotReady, + Message: summarizeIssues("Unlicensed or expired vGPU devices", unlicensed), + } + case len(expiringSoon) > 0: + return metav1.Condition{ + Status: metav1.ConditionFalse, + Reason: conditions.LicenseExpiringSoon, + Message: summarizeIssues("Licenses expiring soon", expiringSoon), + } + case len(dataIssues) > 0: + return metav1.Condition{ + Status: metav1.ConditionUnknown, + Reason: conditions.LicenseInfoMissing, + Message: summarizeIssues("Incomplete license information", dataIssues), + } + default: + message := 
fmt.Sprintf("All %d vGPU device(s) on %d node(s) are licensed", licensedCount, reportedNodes) + if earliestExpiry != nil { + message = fmt.Sprintf("%s (next expiry %s)", message, earliestExpiry.Format(time.RFC3339)) + } + return metav1.Condition{ + Status: metav1.ConditionTrue, + Reason: conditions.LicenseOK, + Message: message, + } + } +} + +func formatDeviceDescriptor(node string, device licenseinfo.DeviceStatus) string { + if device.Product != "" { + return fmt.Sprintf("%s/%s (%s)", node, device.ID, device.Product) + } + return fmt.Sprintf("%s/%s", node, device.ID) +} + +func safeStatus(status string) string { + if status == "" { + return "status unknown" + } + return status +} + +func summarizeIssues(prefix string, entries []string) string { + if len(entries) == 0 { + return prefix + } + if len(entries) > licenseIssueLimit { + shown := entries[:licenseIssueLimit] + return fmt.Sprintf("%s: %s (and %d more)", prefix, strings.Join(shown, "; "), len(entries)-licenseIssueLimit) + } + return fmt.Sprintf("%s: %s", prefix, strings.Join(entries, "; ")) +} diff --git a/controllers/license_condition_test.go b/controllers/license_condition_test.go new file mode 100644 index 000000000..9c22d91fa --- /dev/null +++ b/controllers/license_condition_test.go @@ -0,0 +1,80 @@ +package controllers + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/NVIDIA/gpu-operator/internal/conditions" + "github.com/NVIDIA/gpu-operator/internal/licenseinfo" +) + +func TestSummarizeLicenseSnapshots_AllLicensed(t *testing.T) { + now := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) + expiry := now.Add(30 * 24 * time.Hour) + snapshots := []nodeLicenseSnapshot{ + { + node: "node-a", + annotationPresent: true, + snapshot: &licenseinfo.Snapshot{ + Devices: []licenseinfo.DeviceStatus{ + {ID: "0000:00:01.0", Licensed: true, Expiry: &expiry}, + }, + }, + }, + } + + cond := summarizeLicenseSnapshots(snapshots, now) + 
require.Equal(t, metav1.ConditionTrue, cond.Status) + require.Equal(t, conditions.LicenseOK, cond.Reason) + require.Contains(t, cond.Message, "All 1 vGPU device(s)") +} + +func TestSummarizeLicenseSnapshots_ExpiringSoon(t *testing.T) { + now := time.Now() + expiry := now.Add(12 * time.Hour) + snapshots := []nodeLicenseSnapshot{ + { + node: "node-b", + annotationPresent: true, + snapshot: &licenseinfo.Snapshot{ + Devices: []licenseinfo.DeviceStatus{ + {ID: "0000:00:02.0", Licensed: true, Expiry: &expiry}, + }, + }, + }, + } + + cond := summarizeLicenseSnapshots(snapshots, now) + require.Equal(t, metav1.ConditionFalse, cond.Status) + require.Equal(t, conditions.LicenseExpiringSoon, cond.Reason) +} + +func TestSummarizeLicenseSnapshots_Unlicensed(t *testing.T) { + now := time.Now() + snapshots := []nodeLicenseSnapshot{ + { + node: "node-c", + annotationPresent: true, + snapshot: &licenseinfo.Snapshot{ + Devices: []licenseinfo.DeviceStatus{ + {ID: "0000:00:03.0", Licensed: false, Status: "Unlicensed"}, + }, + }, + }, + } + + cond := summarizeLicenseSnapshots(snapshots, now) + require.Equal(t, metav1.ConditionFalse, cond.Status) + require.Equal(t, conditions.LicenseNotReady, cond.Reason) +} + +func TestSummarizeLicenseSnapshots_MissingAnnotation(t *testing.T) { + cond := summarizeLicenseSnapshots([]nodeLicenseSnapshot{ + {node: "node-d", annotationPresent: false}, + }, time.Now()) + require.Equal(t, metav1.ConditionUnknown, cond.Status) + require.Equal(t, conditions.LicenseInfoMissing, cond.Reason) +} diff --git a/internal/conditions/conditions.go b/internal/conditions/conditions.go index 072a6589a..27b7c52c6 100644 --- a/internal/conditions/conditions.go +++ b/internal/conditions/conditions.go @@ -25,6 +25,8 @@ const ( Ready = "Ready" // Error condition type indicates one or more of the resources managed by the controller are in error state Error = "Error" + // Licensed indicates the aggregated vGPU licensing condition for the cluster. 
+ Licensed = "Licensed" ) // Updater interface diff --git a/internal/conditions/consts.go b/internal/conditions/consts.go index 78e37d774..05083b717 100644 --- a/internal/conditions/consts.go +++ b/internal/conditions/consts.go @@ -32,4 +32,17 @@ const ( OperandNotReady = "OperandNotReady" // DriverNotReady indicates that the driver daemonset pods are not ready DriverNotReady = "DriverNotReady" + + // LicenseOK indicates all vGPU devices report healthy license information. + LicenseOK = "LicenseOK" + // LicenseNotRequired indicates that no nodes require vGPU licensing. + LicenseNotRequired = "LicenseNotRequired" + // LicenseInfoMissing indicates that license annotations are missing, unparsable, or incomplete. + LicenseInfoMissing = "LicenseInfoMissing" + // LicenseExpiringSoon indicates that at least one license is near expiry. + LicenseExpiringSoon = "LicenseExpiringSoon" + // LicenseNotReady indicates an expired or unlicensed vGPU. + LicenseNotReady = "LicenseNotReady" + // LicenseCollectionFailed indicates the operator could not evaluate license information. + LicenseCollectionFailed = "LicenseCollectionFailed" ) diff --git a/internal/licenseinfo/snapshot.go b/internal/licenseinfo/snapshot.go new file mode 100644 index 000000000..e312250fd --- /dev/null +++ b/internal/licenseinfo/snapshot.go @@ -0,0 +1,76 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package licenseinfo + +import ( + "encoding/json" + "fmt" + "time" +) + +const ( + // AnnotationKey is the node annotation that stores serialized vGPU license data. + AnnotationKey = "nvidia.com/vgpu-license-statuses" +) + +// DeviceStatus represents the licensing status for a single GPU/vGPU device. +type DeviceStatus struct { + ID string `json:"id"` + Product string `json:"product,omitempty"` + Licensed bool `json:"licensed"` + Status string `json:"status,omitempty"` + Expiry *time.Time `json:"expiry,omitempty"` + Message string `json:"message,omitempty"` +} + +// Snapshot captures the licensing state observed on a node at a specific time. +type Snapshot struct { + CollectedAt time.Time `json:"collectedAt"` + Source string `json:"source,omitempty"` + Error string `json:"error,omitempty"` + Devices []DeviceStatus `json:"devices,omitempty"` +} + +// NewSnapshot initializes a Snapshot with the provided metadata. CollectedAt is normalized to UTC. +func NewSnapshot(devices []DeviceStatus, source string, collectedAt time.Time) Snapshot { + return Snapshot{ + Devices: devices, + Source: source, + CollectedAt: collectedAt.UTC(), + } +} + +// Marshal converts the snapshot into a JSON string for use in annotations. +func (s Snapshot) Marshal() (string, error) { + out, err := json.Marshal(s) + if err != nil { + return "", fmt.Errorf("failed to marshal license snapshot: %w", err) + } + return string(out), nil +} + +// Parse converts the serialized snapshot back into a Snapshot struct. +func Parse(value string) (Snapshot, error) { + if value == "" { + return Snapshot{}, fmt.Errorf("empty license snapshot") + } + var snap Snapshot + if err := json.Unmarshal([]byte(value), &snap); err != nil { + return Snapshot{}, fmt.Errorf("unable to parse license snapshot: %w", err) + } + return snap, nil +}