diff --git a/distros/kubernetes/nvsentinel/charts/gpu-health-monitor/templates/daemonset-dcgm-3.x.yaml b/distros/kubernetes/nvsentinel/charts/gpu-health-monitor/templates/daemonset-dcgm-3.x.yaml index 60af3a000..107f12a86 100644 --- a/distros/kubernetes/nvsentinel/charts/gpu-health-monitor/templates/daemonset-dcgm-3.x.yaml +++ b/distros/kubernetes/nvsentinel/charts/gpu-health-monitor/templates/daemonset-dcgm-3.x.yaml @@ -60,6 +60,8 @@ spec: - {{ .Values.global.metadataPath | quote }} - --processing-strategy - {{ .Values.processingStrategy | quote }} + - --gpu-count-check-enabled + - {{ .Values.gpuCountCheck.enabled | quote }} securityContext: runAsUser: 0 image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default ((.Values.global).image).tag | default .Chart.AppVersion }}-dcgm-3.x" diff --git a/distros/kubernetes/nvsentinel/charts/gpu-health-monitor/templates/daemonset-dcgm-4.x.yaml b/distros/kubernetes/nvsentinel/charts/gpu-health-monitor/templates/daemonset-dcgm-4.x.yaml index cdc6f935f..c723f05f2 100644 --- a/distros/kubernetes/nvsentinel/charts/gpu-health-monitor/templates/daemonset-dcgm-4.x.yaml +++ b/distros/kubernetes/nvsentinel/charts/gpu-health-monitor/templates/daemonset-dcgm-4.x.yaml @@ -60,6 +60,8 @@ spec: - {{ .Values.global.metadataPath | quote }} - --processing-strategy - {{ .Values.processingStrategy | quote }} + - --gpu-count-check-enabled + - {{ .Values.gpuCountCheck.enabled | quote }} securityContext: runAsUser: 0 image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default ((.Values.global).image).tag | default .Chart.AppVersion }}-dcgm-4.x" diff --git a/distros/kubernetes/nvsentinel/charts/gpu-health-monitor/values.yaml b/distros/kubernetes/nvsentinel/charts/gpu-health-monitor/values.yaml index 9f5ee7588..2426327f2 100644 --- a/distros/kubernetes/nvsentinel/charts/gpu-health-monitor/values.yaml +++ b/distros/kubernetes/nvsentinel/charts/gpu-health-monitor/values.yaml @@ -52,6 +52,12 @@ tolerations: [] podAnnotations: {} +# GPU count check configuration +# Compares PCIe bus GPU count (sysfs) vs driver-visible GPU count (DCGM) +# and sends a health event on mismatch +gpuCountCheck: + enabled: true + verbose: "False" # Processing strategy for health events diff --git a/docs/gpu-health-monitor.md b/docs/gpu-health-monitor.md index 79a5e30b6..248716990 100644 --- a/docs/gpu-health-monitor.md +++ b/docs/gpu-health-monitor.md @@ -70,11 +70,46 @@ The monitor checks multiple GPU health aspects through DCGM. Below are some of t > **Note**: The specific health watches available depend on your DCGM version and GPU model. NVIDIA regularly adds new health checks and monitoring capabilities to DCGM. Consult your DCGM documentation for the complete list of supported health watches for your environment. +### GPU Count Mismatch Detection (GpuTotalCountMismatch) + +#### Why Is This Needed? + +DCGM health watches are powerful but they can only monitor GPUs that the NVIDIA driver has already initialized. If a GPU fails during driver initialization at boot, or silently falls off the PCIe bus at runtime without generating an XID error, DCGM simply never sees it -- the GPU vanishes from the software view with no health event generated. This creates a blind spot: a node can be running with fewer GPUs than expected and no existing check raises an alarm. + +The GPU Count Mismatch check closes this gap by comparing the **hardware-level** GPU count (from the Linux sysfs PCIe device tree) against the **software-level** GPU count (from DCGM). 
Because sysfs reflects what the PCIe bus enumerates at boot time, it sees GPUs that the NVIDIA driver may have failed to initialize. Any discrepancy between these two counts indicates a GPU that is physically present on the PCIe bus but not usable by workloads.
+
+#### Coverage
+
+The table below shows which failure scenarios are caught by the existing syslog-health-check versus the new GpuTotalCountMismatch check:
+
+| Scenario                                         | Xid79/GpuFallenOffBus       | GpuTotalCountMismatch              |
+|--------------------------------------------------|-----------------------------|------------------------------------|
+| GPU fails driver init at boot (no XID)           | Miss                        | Catches (sysfs > DCGM)             |
+| GPU falls off at runtime (no XID logged)         | Miss                        | Catches (sysfs cached > DCGM live) |
+| GPU falls off at runtime (XID 79 logged)         | SysLogsGPUFallenOff catches | Also catches                       |
+| GPU physically dead at boot (no PCIe response)   | Miss                        | Miss (invisible to both)           |
+
+#### How It Works
+
+1. **At startup**, the monitor scans sysfs (`/sys/bus/pci/devices/`) for all NVIDIA GPU PCI addresses (vendor `0x10de`, class prefix `0x0302`). This count is cached because sysfs reflects the boot-time PCIe enumeration and does not change during a boot session.
+2. **Every polling cycle**, after a successful DCGM health check, the monitor compares the cached sysfs GPU count against the live DCGM GPU count.
+3. **If a mismatch is detected** (sysfs count != DCGM count), a fatal `GpuTotalCountMismatch` health event is sent. The event message includes the specific PCI addresses of GPUs that are on the PCIe bus but missing from the driver, making diagnosis straightforward.
+4. **If the counts match** (or a previous mismatch resolves), a healthy event is sent to clear any prior mismatch alert.
+
+State-change deduplication ensures that only transitions (healthy to unhealthy or vice versa) generate events, preventing event flooding during repeated polling cycles.
+
+#### Configuration
+
+The GPU count check is enabled by default. It can be disabled by setting the Helm value `gpuCountCheck.enabled` to `false` (passed to the monitor as the `--gpu-count-check-enabled` flag) if your environment has a known reason for GPU count differences (e.g., intentionally partitioned GPUs); a minimal override is sketched below.
+
 ## Key Features
 
 ### DCGM Integration
 Leverages NVIDIA's Data Center GPU Manager for comprehensive GPU health monitoring with proven reliability.
 
+### GPU Count Mismatch Detection
+Compares hardware-level PCIe GPU enumeration (sysfs) against DCGM's software-visible GPU count to catch GPUs that fail driver initialization or silently drop off the bus -- scenarios that DCGM health watches alone cannot detect.
+
 ### Automatic State Management
 Maintains entity-level cache to track reported issues and avoid sending duplicate events within a single boot session.
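For reference, the `gpuCountCheck.enabled` value added to the chart above is what both DCGM daemonset templates pass to the monitor as `--gpu-count-check-enabled`. A minimal Helm values override to switch the check off could look like the sketch below (the file name is illustrative, and if the chart is deployed as a subchart the block may need to be nested under the parent chart's key for it):

```yaml
# gpu-count-check-off.yaml (illustrative file name)
# Disables the PCIe-bus vs. driver-visible GPU count comparison added by this
# change; all other gpu-health-monitor values keep their defaults.
gpuCountCheck:
  enabled: false
```

With this override the templates render `--gpu-count-check-enabled "false"` on the container command line, and the watcher logs "GPU count check is disabled" instead of scanning sysfs at startup.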
diff --git a/health-monitors/gpu-health-monitor/gpu_health_monitor/cli.py b/health-monitors/gpu-health-monitor/gpu_health_monitor/cli.py index f63659846..f84fa6d36 100644 --- a/health-monitors/gpu-health-monitor/gpu_health_monitor/cli.py +++ b/health-monitors/gpu-health-monitor/gpu_health_monitor/cli.py @@ -78,6 +78,13 @@ def _init_event_processor( help="Event processing strategy: EXECUTE_REMEDIATION or STORE_ONLY", required=False, ) +@click.option( + "--gpu-count-check-enabled", + type=bool, + default=True, + help="Enable GPU count check comparing PCIe bus vs driver-visible GPUs", + required=False, +) def cli( dcgm_addr, dcgm_error_mapping_config_file, @@ -88,6 +95,7 @@ def cli( dcgm_k8s_service_enabled, metadata_path, processing_strategy, + gpu_count_check_enabled, ): exit = Event() config = configparser.ConfigParser() @@ -164,6 +172,7 @@ def process_exit_signal(signum, frame): poll_interval_seconds=int(dcgm_config["PollIntervalSeconds"]), callbacks=enabled_event_processors, dcgm_k8s_service_enabled=dcgm_k8s_service_enabled, + gpu_count_check_enabled=gpu_count_check_enabled, ) dcgm_watcher.start([], exit) diff --git a/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/dcgm.py b/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/dcgm.py index 7721b91b9..0130a3508 100644 --- a/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/dcgm.py +++ b/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/dcgm.py @@ -23,6 +23,8 @@ import time import os +from gpu_health_monitor.pcie import get_nvidia_gpu_pci_addresses + DELAY, MULTIPLIER, MAX_DELAY = 2, 1.5, 120 DCGM_4_PYTHON_PATH = "/usr/share/datacenter-gpu-manager-4/bindings/python3" @@ -34,6 +36,7 @@ def __init__( poll_interval_seconds: int, callbacks: list[types.CallbackInterface], dcgm_k8s_service_enabled: bool, + gpu_count_check_enabled: bool = True, ) -> None: self._addr = addr self._poll_interval_seconds = poll_interval_seconds @@ -48,6 +51,7 @@ def __init__( self._callback_thread_pool = ThreadPoolExecutor() self._dcgm_k8s_service_enabled = dcgm_k8s_service_enabled + self._gpu_count_check_enabled = gpu_count_check_enabled def _get_available_health_watches(self) -> dict[int, str]: health_watches = {} @@ -269,6 +273,23 @@ def start(self, fields_to_monitor: list[str], exit: Event) -> None: dcgm_group = None gpu_ids = [] + # Cache sysfs GPU addresses (static per boot) if GPU count check is enabled + sysfs_gpu_pci_addresses = [] + metrics.gpu_count_check_sysfs_scan_failure.set(0) + if self._gpu_count_check_enabled: + sysfs_gpu_pci_addresses = get_nvidia_gpu_pci_addresses() + if not sysfs_gpu_pci_addresses: + log.warning( + "Sysfs GPU enumeration returned 0 GPUs (/sys may not be mounted or " + "accessible). Disabling GPU count check to avoid false results." 
+ ) + metrics.gpu_count_check_sysfs_scan_failure.set(1) + self._gpu_count_check_enabled = False + else: + log.info(f"Sysfs GPU count (PCIe hardware): {len(sysfs_gpu_pci_addresses)}") + else: + log.info("GPU count check is disabled") + # Initial DCGM handle and monitoring setup while not exit.is_set(): with metrics.overall_reconcile_loop_time.time(): @@ -300,6 +321,18 @@ def start(self, fields_to_monitor: list[str], exit: Event) -> None: [health_status, gpu_ids], ) + # GPU count check: compare sysfs (hardware) vs DCGM + if self._gpu_count_check_enabled: + if len(sysfs_gpu_pci_addresses) != len(gpu_ids): + log.warning( + f"GPU count mismatch detected: sysfs={len(sysfs_gpu_pci_addresses)}, " + f"DCGM={len(gpu_ids)}" + ) + self._fire_callback_funcs( + types.CallbackInterface.gpu_count_check_completed.__name__, + [sysfs_gpu_pci_addresses, gpu_ids], + ) + log.debug("Waiting till next cycle") exit.wait(self._poll_interval_seconds) diff --git a/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/metrics.py b/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/metrics.py index 4a28fccd7..5490d3b93 100644 --- a/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/metrics.py +++ b/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/metrics.py @@ -35,3 +35,7 @@ "Number of times an error has occurred", labelnames=["error_name"], ) +gpu_count_check_sysfs_scan_failure = Gauge( + "gpu_count_check_sysfs_scan_failure", + "Set to 1 when sysfs GPU enumeration returns zero GPUs, disabling the GPU count check", +) diff --git a/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/types.py b/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/types.py index 4201a6e3a..54e7a74cf 100644 --- a/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/types.py +++ b/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/types.py @@ -49,3 +49,12 @@ def health_event_occurred(self, health_details: dict[str, HealthDetails], gpu_id def dcgm_connectivity_failed(self): """Called when DCGM connectivity fails during health check.""" pass + + @abc.abstractmethod + def gpu_count_check_completed(self, sysfs_gpu_pci_addresses: list[str], dcgm_gpu_ids: list[int]) -> None: + """Called every cycle after health check with sysfs and DCGM GPU info. + + The implementation compares counts and handles mismatch detection + and resolution internally. + """ + pass diff --git a/health-monitors/gpu-health-monitor/gpu_health_monitor/pcie/__init__.py b/health-monitors/gpu-health-monitor/gpu_health_monitor/pcie/__init__.py new file mode 100644 index 000000000..e4e5714da --- /dev/null +++ b/health-monitors/gpu-health-monitor/gpu_health_monitor/pcie/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .gpu_counter import get_nvidia_gpu_pci_addresses + +__all__ = ["get_nvidia_gpu_pci_addresses"] diff --git a/health-monitors/gpu-health-monitor/gpu_health_monitor/pcie/gpu_counter.py b/health-monitors/gpu-health-monitor/gpu_health_monitor/pcie/gpu_counter.py new file mode 100644 index 000000000..9a65d869b --- /dev/null +++ b/health-monitors/gpu-health-monitor/gpu_health_monitor/pcie/gpu_counter.py @@ -0,0 +1,67 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import logging as log + +NVIDIA_VENDOR_ID = "0x10de" +GPU_CLASS_PREFIX = "0x0302"  # PCI class code prefix for 3D controllers (datacenter GPUs such as A100/H100) + + +def get_nvidia_gpu_pci_addresses(sysfs_base: str = "/sys") -> list[str]: + """Scan sysfs for NVIDIA GPU PCI addresses. + + Reads /sys/bus/pci/devices/*/vendor and /sys/bus/pci/devices/*/class + to find NVIDIA GPUs (vendor 0x10de, class prefix 0x0302). + + Args: + sysfs_base: Base path for sysfs. Default "/sys". Override for testing. + + Returns: + List of PCI addresses like ['0000:04:00.0', '0000:05:00.0', ...] + """ + devices_path = os.path.join(sysfs_base, "bus", "pci", "devices") + gpu_addresses = [] + + try: + entries = os.listdir(devices_path) + except OSError: + log.exception(f"Failed to read PCI devices directory {devices_path}") + return [] + + for entry in sorted(entries): + device_path = os.path.join(devices_path, entry) + + vendor = _read_sysfs_file(os.path.join(device_path, "vendor")) + if vendor != NVIDIA_VENDOR_ID: + continue + + device_class = _read_sysfs_file(os.path.join(device_path, "class")) + if not device_class or not device_class.startswith(GPU_CLASS_PREFIX): + continue + + gpu_addresses.append(entry) # entry is the PCI address like "0000:04:00.0" + log.debug(f"Found NVIDIA GPU on PCIe bus: {entry} (class={device_class})") + + log.info(f"Sysfs GPU enumeration complete: {len(gpu_addresses)} NVIDIA GPUs found") + return gpu_addresses + + +def _read_sysfs_file(path: str) -> str: + """Read and strip a sysfs file. 
Returns empty string on error.""" + try: + with open(path, "r") as f: + return f.read().strip() + except OSError: + return "" diff --git a/health-monitors/gpu-health-monitor/gpu_health_monitor/platform_connector/platform_connector.py b/health-monitors/gpu-health-monitor/gpu_health_monitor/platform_connector/platform_connector.py index 1e4ed8ef7..4823fd3ab 100644 --- a/health-monitors/gpu-health-monitor/gpu_health_monitor/platform_connector/platform_connector.py +++ b/health-monitors/gpu-health-monitor/gpu_health_monitor/platform_connector/platform_connector.py @@ -364,6 +364,117 @@ def send_health_event_with_retries(self, health_events: list[platformconnector_p ) return False + def _clear_gpu_count_mismatch(self, timestamp: Timestamp) -> None: + + check_name = "GpuTotalCountMismatch" + key = self._build_cache_key(check_name, "GPU_TOTAL_COUNT", "ALL") + + # Send if key is not in cache (first cycle) or if previous state was unhealthy + if key not in self.entity_cache or not self.entity_cache[key].isHealthy: + event_metadata = {} + chassis_serial = self._metadata_reader.get_chassis_serial() + if chassis_serial: + event_metadata["chassis_serial"] = chassis_serial + + health_event = platformconnector_pb2.HealthEvent( + version=self._version, + agent=self._agent, + componentClass=self._component_class, + checkName=check_name, + generatedTimestamp=timestamp, + isFatal=False, + isHealthy=True, + errorCode=[], + entitiesImpacted=[], + message="GpuTotalCountMismatch health event reported no errors", + recommendedAction=platformconnector_pb2.NONE, + nodeName=self._node_name, + metadata=event_metadata, + processingStrategy=self._processing_strategy, + ) + + try: + if self.send_health_event_with_retries([health_event]): + self.entity_cache[key] = CachedEntityState(isFatal=False, isHealthy=True) + log.info(f"Updated cache for key {key} after successful send") + except Exception as e: + log.exception(f"Exception while sending GPU count match resolved event: {e}") + + def gpu_count_check_completed(self, sysfs_gpu_pci_addresses: list[str], dcgm_gpu_ids: list[int]) -> None: + """Handle GPU count check: detect mismatch or clear previous mismatch.""" + with metrics.dcgm_health_events_publish_time_to_grpc_channel.labels("gpu_count_check_to_grpc_channel").time(): + timestamp = Timestamp() + timestamp.GetCurrentTime() + + if len(sysfs_gpu_pci_addresses) != len(dcgm_gpu_ids): + self._send_gpu_count_mismatch_event(timestamp, sysfs_gpu_pci_addresses, dcgm_gpu_ids) + else: + self._clear_gpu_count_mismatch(timestamp) + + def _send_gpu_count_mismatch_event( + self, + timestamp: Timestamp, + sysfs_gpu_pci_addresses: list[str], + dcgm_gpu_ids: list[int], + ) -> None: + """Send unhealthy health event for GPU count mismatch.""" + check_name = "GpuTotalCountMismatch" + key = self._build_cache_key(check_name, "GPU_TOTAL_COUNT", "ALL") + + # Only send if state changed (avoid flooding) + if key in self.entity_cache and not self.entity_cache[key].isHealthy: + return + + # Determine which PCI addresses are in sysfs but not in DCGM + dcgm_pci_addresses = set() + for gpu_id in dcgm_gpu_ids: + pci = self._metadata_reader.get_pci_address(gpu_id) + if pci: + dcgm_pci_addresses.add(pci) + + sysfs_pci_set = set(sysfs_gpu_pci_addresses) + missing_from_driver = sorted(sysfs_pci_set - dcgm_pci_addresses) + + entities_impacted = [] + for pci_addr in missing_from_driver: + entities_impacted.append(platformconnector_pb2.Entity(entityType="PCI", entityValue=pci_addr)) + + message = ( + f"GPU count mismatch: {len(sysfs_gpu_pci_addresses)} 
GPUs found on PCIe bus " + f"but {len(dcgm_gpu_ids)} visible to S/W. " + f"Missing from driver: [{', '.join(missing_from_driver)}]. " + f"Run nvidia-smi to verify the GPU count." + ) + + event_metadata = {} + chassis_serial = self._metadata_reader.get_chassis_serial() + if chassis_serial: + event_metadata["chassis_serial"] = chassis_serial + + health_event = platformconnector_pb2.HealthEvent( + version=self._version, + agent=self._agent, + componentClass=self._component_class, + checkName=check_name, + generatedTimestamp=timestamp, + isFatal=True, + isHealthy=False, + errorCode=["GPU_TOTAL_COUNT_MISMATCH"], + entitiesImpacted=entities_impacted, + message=message, + recommendedAction=platformconnector_pb2.RESTART_VM, + nodeName=self._node_name, + metadata=event_metadata, + processingStrategy=self._processing_strategy, + ) + + try: + if self.send_health_event_with_retries([health_event]): + self.entity_cache[key] = CachedEntityState(isFatal=True, isHealthy=False) + log.info("GPU count mismatch event sent successfully") + except Exception as e: + log.error(f"Exception while sending GPU count mismatch event: {e}") + def dcgm_connectivity_failed(self) -> None: """Handle DCGM connectivity failure event.""" with metrics.dcgm_health_events_publish_time_to_grpc_channel.labels( diff --git a/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_dcgm_watcher/test_dcgm.py b/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_dcgm_watcher/test_dcgm.py index bb61373c2..c3e0cb341 100644 --- a/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_dcgm_watcher/test_dcgm.py +++ b/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_dcgm_watcher/test_dcgm.py @@ -34,6 +34,9 @@ def health_event_occurred(self, health_details: dict[str, dcgm.types.HealthDetai def dcgm_connectivity_failed(self): self.connectivity_failed_called = True + def gpu_count_check_completed(self, sysfs_gpu_pci_addresses: list[str], dcgm_gpu_ids: list[int]): + pass + class TestDCGMHealthChecks: diff --git a/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_pcie/__init__.py b/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_pcie/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_pcie/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_pcie/test_gpu_counter.py b/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_pcie/test_gpu_counter.py new file mode 100644 index 000000000..4bad6d69c --- /dev/null +++ b/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_pcie/test_gpu_counter.py @@ -0,0 +1,118 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile +import pytest +from gpu_health_monitor.pcie.gpu_counter import get_nvidia_gpu_pci_addresses + + +def _create_fake_device(base: str, addr: str, vendor: str, device_class: str) -> None: + """Create a fake sysfs PCI device entry for testing.""" + device_dir = os.path.join(base, "bus", "pci", "devices", addr) + os.makedirs(device_dir, exist_ok=True) + with open(os.path.join(device_dir, "vendor"), "w") as f: + f.write(vendor) + with open(os.path.join(device_dir, "class"), "w") as f: + f.write(device_class) + + +class TestGetNvidiaGpuPciAddresses: + def test_finds_nvidia_gpus(self): + """Test that NVIDIA GPUs with correct class are detected.""" + with tempfile.TemporaryDirectory() as base: + _create_fake_device(base, "0000:04:00.0", "0x10de", "0x030200") + _create_fake_device(base, "0000:05:00.0", "0x10de", "0x030200") + result = get_nvidia_gpu_pci_addresses(sysfs_base=base) + assert result == ["0000:04:00.0", "0000:05:00.0"] + + def test_filters_nvidia_nvswitch(self): + """Test that NVIDIA NVSwitch devices (class 0x0680) are filtered out.""" + with tempfile.TemporaryDirectory() as base: + _create_fake_device(base, "0000:04:00.0", "0x10de", "0x030200") + _create_fake_device(base, "0000:c5:00.0", "0x10de", "0x068000") # NVSwitch + result = get_nvidia_gpu_pci_addresses(sysfs_base=base) + assert result == ["0000:04:00.0"] + + def test_empty_directory(self): + """Test that an empty sysfs devices directory returns empty list.""" + with tempfile.TemporaryDirectory() as base: + devices_path = os.path.join(base, "bus", "pci", "devices") + os.makedirs(devices_path, exist_ok=True) + result = get_nvidia_gpu_pci_addresses(sysfs_base=base) + assert result == [] + + def test_nonexistent_directory(self): + """Test that a non-existent sysfs path returns empty list without crashing.""" + result = get_nvidia_gpu_pci_addresses(sysfs_base="/nonexistent/path") + assert result == [] + + def test_multiple_gpus_sorted(self): + """Test that multiple GPUs are returned in sorted order.""" + with tempfile.TemporaryDirectory() as base: + # Create in non-sorted order + _create_fake_device(base, "0000:82:00.0", "0x10de", "0x030200") + _create_fake_device(base, "0000:04:00.0", "0x10de", "0x030200") + _create_fake_device(base, "0000:41:00.0", "0x10de", "0x030200") + _create_fake_device(base, "0000:c1:00.0", "0x10de", "0x030200") + result = get_nvidia_gpu_pci_addresses(sysfs_base=base) + assert result == ["0000:04:00.0", "0000:41:00.0", "0000:82:00.0", "0000:c1:00.0"] + + def test_missing_vendor_file(self): + """Test that devices with missing vendor file are skipped.""" + with tempfile.TemporaryDirectory() as base: + _create_fake_device(base, "0000:04:00.0", "0x10de", "0x030200") + # Create device with only class file (no vendor) + device_dir = os.path.join(base, "bus", "pci", "devices", "0000:05:00.0") + os.makedirs(device_dir, exist_ok=True) + with open(os.path.join(device_dir, "class"), "w") as f: + f.write("0x030200") + result = 
get_nvidia_gpu_pci_addresses(sysfs_base=base) + assert result == ["0000:04:00.0"] + + def test_missing_class_file(self): + """Test that NVIDIA devices with missing class file are skipped.""" + with tempfile.TemporaryDirectory() as base: + _create_fake_device(base, "0000:04:00.0", "0x10de", "0x030200") + # Create device with only vendor file (no class) + device_dir = os.path.join(base, "bus", "pci", "devices", "0000:05:00.0") + os.makedirs(device_dir, exist_ok=True) + with open(os.path.join(device_dir, "vendor"), "w") as f: + f.write("0x10de") + result = get_nvidia_gpu_pci_addresses(sysfs_base=base) + assert result == ["0000:04:00.0"] + + def test_eight_h100_gpus(self): + """Test realistic 8x H100 GPU scenario.""" + with tempfile.TemporaryDirectory() as base: + gpu_addresses = [ + "0000:04:00.0", + "0000:05:00.0", + "0000:41:00.0", + "0000:42:00.0", + "0000:82:00.0", + "0000:83:00.0", + "0000:c1:00.0", + "0000:c2:00.0", + ] + # Also add NVSwitches and other NVIDIA devices + for addr in gpu_addresses: + _create_fake_device(base, addr, "0x10de", "0x030200") + _create_fake_device(base, "0000:c5:00.0", "0x10de", "0x068000") # NVSwitch + _create_fake_device(base, "0000:c6:00.0", "0x10de", "0x068000") # NVSwitch + _create_fake_device(base, "0000:00:00.0", "0x8086", "0x060000") # Intel host bridge + + result = get_nvidia_gpu_pci_addresses(sysfs_base=base) + assert result == gpu_addresses + assert len(result) == 8 diff --git a/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_platform_connector/test_gpu_count_mismatch.py b/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_platform_connector/test_gpu_count_mismatch.py new file mode 100644 index 000000000..e3fe983cf --- /dev/null +++ b/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_platform_connector/test_gpu_count_mismatch.py @@ -0,0 +1,273 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import tempfile +import time +import unittest +from concurrent import futures +from threading import Event +from typing import Any + +import grpc +from gpu_health_monitor.platform_connector import platform_connector + +from gpu_health_monitor.protos import ( + health_event_pb2 as platformconnector_pb2, + health_event_pb2_grpc as platformconnector_pb2_grpc, +) + +socket_path = "/tmp/nvsentinel.sock" +node_name = "test-node-1" + + +def sample_metadata(): + """Sample GPU metadata with 4 GPUs for testing.""" + return { + "version": "1.0", + "timestamp": "2025-11-07T10:00:00Z", + "node_name": "test-node", + "chassis_serial": "CHASSIS-12345", + "gpus": [ + { + "gpu_id": 0, + "uuid": "GPU-00000000-0000-0000-0000-000000000000", + "pci_address": "0000:04:00.0", + "serial_number": "SN-GPU-0", + "device_name": "NVIDIA H100", + "nvlinks": [], + }, + { + "gpu_id": 1, + "uuid": "GPU-11111111-1111-1111-1111-111111111111", + "pci_address": "0000:05:00.0", + "serial_number": "SN-GPU-1", + "device_name": "NVIDIA H100", + "nvlinks": [], + }, + { + "gpu_id": 2, + "uuid": "GPU-22222222-2222-2222-2222-222222222222", + "pci_address": "0000:41:00.0", + "serial_number": "SN-GPU-2", + "device_name": "NVIDIA H100", + "nvlinks": [], + }, + { + "gpu_id": 3, + "uuid": "GPU-33333333-3333-3333-3333-333333333333", + "pci_address": "0000:82:00.0", + "serial_number": "SN-GPU-3", + "device_name": "NVIDIA H100", + "nvlinks": [], + }, + ], + } + + +def metadata_file(): + """Create a temporary metadata file for testing.""" + f = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") + json.dump(sample_metadata(), f) + return f.name + + +class PlatformConnectorServicer(platformconnector_pb2_grpc.PlatformConnectorServicer): + def __init__(self) -> None: + self.health_events: list[platformconnector_pb2.HealthEvent] = [] + + def HealthEventOccurredV1(self, request: platformconnector_pb2.HealthEvents, context: Any): + self.health_events = list(request.events) + return platformconnector_pb2.HealthEvents() + + +class TestGpuCountMismatch(unittest.TestCase): + def setUp(self): + self.healthEventProcessor = PlatformConnectorServicer() + self.server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) + platformconnector_pb2_grpc.add_PlatformConnectorServicer_to_server(self.healthEventProcessor, self.server) + self.server.add_insecure_port(f"unix://{socket_path}") + self.server.start() + + self.exit = Event() + self.temp_file_path = metadata_file() + + dcgm_health_conditions_categorization_mapping_config = { + "DCGM_HEALTH_WATCH_PCIE": "Fatal", + "DCGM_HEALTH_WATCH_NVLINK": "Fatal", + } + + self.processor = platform_connector.PlatformConnectorEventProcessor( + socket_path=socket_path, + node_name=node_name, + exit=self.exit, + dcgm_errors_info_dict={}, + state_file_path="statefile", + dcgm_health_conditions_categorization_mapping_config=dcgm_health_conditions_categorization_mapping_config, + metadata_path=self.temp_file_path, + processing_strategy=platformconnector_pb2.EXECUTE_REMEDIATION, + ) + + def tearDown(self): + self.server.stop(0) + if os.path.exists(self.temp_file_path): + os.unlink(self.temp_file_path) + # Clean up the UDS socket file if it exists + if os.path.exists(socket_path): + os.unlink(socket_path) + + def test_mismatch_sends_unhealthy_event(self): + """Test that a GPU count mismatch fires an unhealthy health event with correct fields.""" + # sysfs sees 4 GPUs, DCGM sees 3 (GPU 3 missing from driver) + sysfs_pci_addresses = ["0000:04:00.0", "0000:05:00.0", "0000:41:00.0", 
"0000:82:00.0"] + dcgm_gpu_ids = [0, 1, 2] + + self.processor.gpu_count_check_completed(sysfs_pci_addresses, dcgm_gpu_ids) + + health_events = self.healthEventProcessor.health_events + assert len(health_events) == 1 + + event = health_events[0] + assert event.checkName == "GpuTotalCountMismatch" + assert event.isFatal is True + assert event.isHealthy is False + assert event.errorCode == ["GPU_TOTAL_COUNT_MISMATCH"] + assert event.recommendedAction == platformconnector_pb2.RESTART_VM + assert event.nodeName == node_name + assert event.agent == "gpu-health-monitor" + assert event.componentClass == "GPU" + assert event.processingStrategy == platformconnector_pb2.EXECUTE_REMEDIATION + + # The missing GPU (0000:82:00.0) should be in entities impacted + assert len(event.entitiesImpacted) == 1 + assert event.entitiesImpacted[0].entityType == "PCI" + assert event.entitiesImpacted[0].entityValue == "0000:82:00.0" + + # Message should include counts and missing addresses + assert "4 GPUs found on PCIe bus" in event.message + assert "3 visible to S/W" in event.message + assert "0000:82:00.0" in event.message + assert "Run nvidia-smi" in event.message + + # Metadata should include chassis serial + assert event.metadata["chassis_serial"] == "CHASSIS-12345" + + def test_match_sends_healthy_event_on_first_cycle(self): + """Test that matching GPU counts send a healthy event on first cycle (cache empty).""" + sysfs_pci_addresses = ["0000:04:00.0", "0000:05:00.0", "0000:41:00.0", "0000:82:00.0"] + dcgm_gpu_ids = [0, 1, 2, 3] + + self.processor.gpu_count_check_completed(sysfs_pci_addresses, dcgm_gpu_ids) + + health_events = self.healthEventProcessor.health_events + assert len(health_events) == 1 + + event = health_events[0] + assert event.checkName == "GpuTotalCountMismatch" + assert event.isHealthy is True + assert event.isFatal is False + assert event.recommendedAction == platformconnector_pb2.NONE + assert "reported no errors" in event.message + + # Verify cache is populated + key = self.processor._build_cache_key("GpuTotalCountMismatch", "GPU_TOTAL_COUNT", "ALL") + assert key in self.processor.entity_cache + assert self.processor.entity_cache[key].isHealthy is True + + def test_match_no_event_on_second_cycle(self): + """Test that matching GPU counts do not send a duplicate event on subsequent cycles.""" + sysfs_pci_addresses = ["0000:04:00.0", "0000:05:00.0", "0000:41:00.0", "0000:82:00.0"] + dcgm_gpu_ids = [0, 1, 2, 3] + + # First cycle: sends healthy event + self.processor.gpu_count_check_completed(sysfs_pci_addresses, dcgm_gpu_ids) + assert len(self.healthEventProcessor.health_events) == 1 + + # Reset the servicer to track new events + self.healthEventProcessor.health_events = [] + + # Second cycle: cache already has healthy state, no event sent + self.processor.gpu_count_check_completed(sysfs_pci_addresses, dcgm_gpu_ids) + assert len(self.healthEventProcessor.health_events) == 0 + + def test_entity_cache_prevents_duplicate_sends(self): + """Test that the entity cache prevents sending duplicate mismatch events.""" + sysfs_pci_addresses = ["0000:04:00.0", "0000:05:00.0", "0000:41:00.0", "0000:82:00.0"] + dcgm_gpu_ids = [0, 1, 2] + + # First call should send the event + self.processor.gpu_count_check_completed(sysfs_pci_addresses, dcgm_gpu_ids) + assert len(self.healthEventProcessor.health_events) == 1 + + # Reset the servicer to track new events + self.healthEventProcessor.health_events = [] + + # Second call should be suppressed by entity cache + 
self.processor.gpu_count_check_completed(sysfs_pci_addresses, dcgm_gpu_ids) + assert len(self.healthEventProcessor.health_events) == 0 + + def test_mismatch_then_resolved(self): + """Test full lifecycle: mismatch detected then resolved.""" + sysfs_pci_addresses = ["0000:04:00.0", "0000:05:00.0", "0000:41:00.0", "0000:82:00.0"] + dcgm_gpu_ids_missing = [0, 1, 2] + dcgm_gpu_ids_full = [0, 1, 2, 3] + + # First: mismatch (sysfs=4, DCGM=3) + self.processor.gpu_count_check_completed(sysfs_pci_addresses, dcgm_gpu_ids_missing) + events = self.healthEventProcessor.health_events + assert len(events) == 1 + assert events[0].isHealthy is False + assert events[0].checkName == "GpuTotalCountMismatch" + + # Verify cache has the mismatch state + key = self.processor._build_cache_key("GpuTotalCountMismatch", "GPU_TOTAL_COUNT", "ALL") + assert key in self.processor.entity_cache + assert self.processor.entity_cache[key].isHealthy is False + + # Then: resolved (sysfs=4, DCGM=4) + self.healthEventProcessor.health_events = [] + self.processor.gpu_count_check_completed(sysfs_pci_addresses, dcgm_gpu_ids_full) + events = self.healthEventProcessor.health_events + assert len(events) == 1 + resolved_event = events[0] + assert resolved_event.isHealthy is True + assert resolved_event.isFatal is False + assert resolved_event.checkName == "GpuTotalCountMismatch" + assert resolved_event.recommendedAction == platformconnector_pb2.NONE + assert "reported no errors" in resolved_event.message + + # Verify cache is updated + assert self.processor.entity_cache[key].isHealthy is True + + def test_multiple_missing_gpus(self): + """Test mismatch with multiple GPUs missing from DCGM.""" + # sysfs sees 4 GPUs, DCGM sees 2 (GPU 2 and 3 missing) + sysfs_pci_addresses = ["0000:04:00.0", "0000:05:00.0", "0000:41:00.0", "0000:82:00.0"] + dcgm_gpu_ids = [0, 1] + + self.processor.gpu_count_check_completed(sysfs_pci_addresses, dcgm_gpu_ids) + + health_events = self.healthEventProcessor.health_events + assert len(health_events) == 1 + + event = health_events[0] + # Both missing GPUs should be in entities impacted + assert len(event.entitiesImpacted) == 2 + pci_values = [e.entityValue for e in event.entitiesImpacted] + assert "0000:41:00.0" in pci_values + assert "0000:82:00.0" in pci_values + + assert "4 GPUs found on PCIe bus" in event.message + assert "2 visible to S/W" in event.message diff --git a/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_platform_connector/test_platform_connector.py b/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_platform_connector/test_platform_connector.py index d79b9170d..e521146ef 100644 --- a/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_platform_connector/test_platform_connector.py +++ b/health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_platform_connector/test_platform_connector.py @@ -735,6 +735,148 @@ def test_dcgm_connectivity_restored(self): server.stop(0) + def test_gpu_count_mismatch_sends_unhealthy_event(self): + """Test that sysfs > DCGM GPU count fires an unhealthy GpuTotalCountMismatch event.""" + healthEventProcessor = PlatformConnectorServicer() + server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) + platformconnector_pb2_grpc.add_PlatformConnectorServicer_to_server(healthEventProcessor, server) + server.add_insecure_port(f"unix://{socket_path}") + server.start() + + exit = Event() + + # 4-GPU metadata so the processor can resolve PCI addresses + gpu_count_metadata = { + "version": "1.0", + "timestamp": 
"2025-11-07T10:00:00Z", + "node_name": "test-node", + "chassis_serial": "CHASSIS-12345", + "gpus": [ + { + "gpu_id": i, + "uuid": f"GPU-{i:08d}", + "pci_address": addr, + "serial_number": f"SN-{i}", + "device_name": "NVIDIA H100", + "nvlinks": [], + } + for i, addr in enumerate(["0000:04:00.0", "0000:05:00.0", "0000:41:00.0", "0000:82:00.0"]) + ], + } + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: + json.dump(gpu_count_metadata, f) + temp_file_path = f.name + + try: + processor = platform_connector.PlatformConnectorEventProcessor( + socket_path=socket_path, + node_name=node_name, + exit=exit, + dcgm_errors_info_dict={}, + state_file_path="statefile", + dcgm_health_conditions_categorization_mapping_config={"DCGM_HEALTH_WATCH_PCIE": "Fatal"}, + metadata_path=temp_file_path, + processing_strategy=platformconnector_pb2.EXECUTE_REMEDIATION, + ) + + # sysfs sees 4 GPUs, DCGM sees 3 (GPU 3 missing from driver) + sysfs = ["0000:04:00.0", "0000:05:00.0", "0000:41:00.0", "0000:82:00.0"] + processor.gpu_count_check_completed(sysfs, [0, 1, 2]) + + events = healthEventProcessor.health_events + assert len(events) == 1 + + event = events[0] + assert event.checkName == "GpuTotalCountMismatch" + assert event.isFatal is True + assert event.isHealthy is False + assert event.errorCode == ["GPU_TOTAL_COUNT_MISMATCH"] + assert event.recommendedAction == platformconnector_pb2.RESTART_VM + assert event.nodeName == node_name + assert len(event.entitiesImpacted) == 1 + assert event.entitiesImpacted[0].entityType == "PCI" + assert event.entitiesImpacted[0].entityValue == "0000:82:00.0" + assert "4 GPUs found on PCIe bus" in event.message + assert "3 visible to S/W" in event.message + assert event.metadata["chassis_serial"] == "CHASSIS-12345" + + server.stop(0) + finally: + if os.path.exists(temp_file_path): + os.unlink(temp_file_path) + + def test_gpu_count_mismatch_then_resolved(self): + """Test full lifecycle: mismatch detected then resolved.""" + healthEventProcessor = PlatformConnectorServicer() + server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) + platformconnector_pb2_grpc.add_PlatformConnectorServicer_to_server(healthEventProcessor, server) + server.add_insecure_port(f"unix://{socket_path}") + server.start() + + exit = Event() + + gpu_count_metadata = { + "version": "1.0", + "timestamp": "2025-11-07T10:00:00Z", + "node_name": "test-node", + "chassis_serial": "CHASSIS-12345", + "gpus": [ + { + "gpu_id": i, + "uuid": f"GPU-{i:08d}", + "pci_address": addr, + "serial_number": f"SN-{i}", + "device_name": "NVIDIA H100", + "nvlinks": [], + } + for i, addr in enumerate(["0000:04:00.0", "0000:05:00.0", "0000:41:00.0", "0000:82:00.0"]) + ], + } + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: + json.dump(gpu_count_metadata, f) + temp_file_path = f.name + + try: + processor = platform_connector.PlatformConnectorEventProcessor( + socket_path=socket_path, + node_name=node_name, + exit=exit, + dcgm_errors_info_dict={}, + state_file_path="statefile", + dcgm_health_conditions_categorization_mapping_config={"DCGM_HEALTH_WATCH_PCIE": "Fatal"}, + metadata_path=temp_file_path, + processing_strategy=platformconnector_pb2.EXECUTE_REMEDIATION, + ) + + sysfs = ["0000:04:00.0", "0000:05:00.0", "0000:41:00.0", "0000:82:00.0"] + + # Step 1: mismatch (sysfs=4, DCGM=3) + processor.gpu_count_check_completed(sysfs, [0, 1, 2]) + events = healthEventProcessor.health_events + assert len(events) == 1 + assert events[0].isHealthy is False + assert 
events[0].checkName == "GpuTotalCountMismatch" + + key = processor._build_cache_key("GpuTotalCountMismatch", "GPU_TOTAL_COUNT", "ALL") + assert key in processor.entity_cache + assert processor.entity_cache[key].isHealthy is False + + # Step 2: resolved (sysfs=4, DCGM=4) + healthEventProcessor.health_events = [] + processor.gpu_count_check_completed(sysfs, [0, 1, 2, 3]) + events = healthEventProcessor.health_events + assert len(events) == 1 + assert events[0].isHealthy is True + assert events[0].checkName == "GpuTotalCountMismatch" + assert events[0].recommendedAction == platformconnector_pb2.NONE + assert "reported no errors" in events[0].message + assert processor.entity_cache[key].isHealthy is True + + server.stop(0) + finally: + if os.path.exists(temp_file_path): + os.unlink(temp_file_path) + def test_event_retry_and_cache_cleanup_when_platform_connector_down(self) -> None: """Test when platform connector goes down and comes back up.""" import tempfile