@@ -60,6 +60,8 @@ spec:
- {{ .Values.global.metadataPath | quote }}
- --processing-strategy
- {{ .Values.processingStrategy | quote }}
- --gpu-count-check-enabled
- {{ .Values.gpuCountCheck.enabled | quote }}
securityContext:
runAsUser: 0
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default ((.Values.global).image).tag | default .Chart.AppVersion }}-dcgm-3.x"
@@ -60,6 +60,8 @@ spec:
- {{ .Values.global.metadataPath | quote }}
- --processing-strategy
- {{ .Values.processingStrategy | quote }}
- --gpu-count-check-enabled
- {{ .Values.gpuCountCheck.enabled | quote }}
securityContext:
runAsUser: 0
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default ((.Values.global).image).tag | default .Chart.AppVersion }}-dcgm-4.x"
@@ -52,6 +52,12 @@ tolerations: []

podAnnotations: {}

# GPU count check configuration
# Compares PCIe bus GPU count (sysfs) vs driver-visible GPU count (DCGM)
# and sends a health event on mismatch
gpuCountCheck:
enabled: true

verbose: "False"

# Processing strategy for health events
35 changes: 35 additions & 0 deletions docs/gpu-health-monitor.md
@@ -70,11 +70,46 @@ The monitor checks multiple GPU health aspects through DCGM. Below are some of t

> **Note**: The specific health watches available depend on your DCGM version and GPU model. NVIDIA regularly adds new health checks and monitoring capabilities to DCGM. Consult your DCGM documentation for the complete list of supported health watches for your environment.

### GPU Count Mismatch Detection (GpuTotalCountMismatch)

#### Why Is This Needed?

DCGM health watches are powerful, but they can only monitor GPUs that the NVIDIA driver has already initialized. If a GPU fails during driver initialization at boot, or silently falls off the PCIe bus at runtime without generating an XID error, DCGM simply never sees it: the GPU vanishes from the software view and no health event is generated. This creates a blind spot: a node can run with fewer GPUs than expected while no existing check raises an alarm.

The GPU Count Mismatch check closes this gap by comparing the **hardware-level** GPU count (from the Linux sysfs PCIe device tree) against the **software-level** GPU count (from DCGM). Because sysfs reflects what the PCIe bus enumerates at boot time, it sees GPUs that the NVIDIA driver may have failed to initialize. Any discrepancy between these two counts indicates a GPU that is physically present on the PCIe bus but not usable by workloads.
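
For illustration, the hardware-level side of the comparison boils down to counting PCI devices whose vendor ID is `0x10de` and whose class code starts with `0x0302` (3D controller). A minimal sketch of that enumeration, assuming sysfs is mounted at `/sys`:

```python
import os

NVIDIA_VENDOR_ID = "0x10de"   # NVIDIA PCI vendor ID
GPU_CLASS_PREFIX = "0x0302"   # PCI class 0x03 (display), subclass 0x02 (3D controller)


def count_gpus_on_pcie_bus(sysfs_base: str = "/sys") -> int:
    """Count NVIDIA GPUs enumerated on the PCIe bus, independently of the driver."""
    devices_path = os.path.join(sysfs_base, "bus", "pci", "devices")
    count = 0
    for entry in os.listdir(devices_path):
        try:
            with open(os.path.join(devices_path, entry, "vendor")) as f:
                vendor = f.read().strip()
            with open(os.path.join(devices_path, entry, "class")) as f:
                device_class = f.read().strip()
        except OSError:
            continue  # attribute unreadable or device disappeared; skip it
        if vendor == NVIDIA_VENDOR_ID and device_class.startswith(GPU_CLASS_PREFIX):
            count += 1
    return count
```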

#### Coverage

The table below shows which failure scenarios are caught by the existing syslog-health-check (Xid79/GpuFallenOffBus) versus the new GpuTotalCountMismatch check:

| Scenario | Xid79/GpuFallenOffBus | GpuTotalCountMismatch |
|-------------------------------------------------|-----------------------------|------------------------------------|
| GPU fails driver init at boot (no XID) | Miss | Catches (sysfs > DCGM) |
| GPU falls off at runtime (no XID logged) | Miss | Catches (sysfs cached > DCGM live) |
| GPU falls off at runtime (XID 79 logged)         | Catches (SysLogsGPUFallenOff)| Also catches                       |
| GPU physically dead at boot (no PCIe response) | Miss | Miss (invisible to both) |

#### How It Works

1. **At startup**, the monitor scans sysfs (`/sys/bus/pci/devices/`) for all NVIDIA GPU PCI addresses (vendor `0x10de`, class prefix `0x0302`). This count is cached because sysfs reflects the boot-time PCIe enumeration and does not change during a boot session.
2. **Every polling cycle**, after a successful DCGM health check, the monitor compares the cached sysfs GPU count against the live DCGM GPU count.
3. **If a mismatch is detected** (sysfs count != DCGM count), a fatal `GpuTotalCountMismatch` health event is sent. The event message includes the specific PCI addresses of GPUs that are on the PCIe bus but missing from the driver, making diagnosis straightforward.
4. **If the counts match** (or a previous mismatch resolves), a healthy event is sent to clear any prior mismatch alert.

State-change deduplication ensures that only transitions (healthy to unhealthy or vice versa) generate events, preventing event flooding during repeated polling cycles.
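
A minimal sketch of that transition-only logic follows; the class, method, and `send_event` names are illustrative, not the monitor's actual implementation:

```python
class GpuCountCheck:
    """Illustrative sketch: compare a cached sysfs count against the live DCGM count."""

    def __init__(self, sysfs_gpu_pci_addresses: list[str]):
        self._sysfs_addresses = sysfs_gpu_pci_addresses  # cached once at startup
        self._mismatch_reported = False                  # last reported state

    def on_poll(self, dcgm_gpu_ids: list[int], send_event) -> None:
        mismatch = len(self._sysfs_addresses) != len(dcgm_gpu_ids)
        if mismatch and not self._mismatch_reported:
            # healthy -> unhealthy transition: raise the fatal event once
            send_event("GpuTotalCountMismatch", healthy=False)
            self._mismatch_reported = True
        elif not mismatch and self._mismatch_reported:
            # unhealthy -> healthy transition: clear the prior alert
            send_event("GpuTotalCountMismatch", healthy=True)
            self._mismatch_reported = False
        # no transition: stay quiet to avoid event flooding
```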

#### Configuration

The GPU count check is enabled by default. It can be disabled via the `--gpu-count-check-enabled` CLI flag (exposed as the `gpuCountCheck.enabled` Helm value) if your environment has a known reason for a GPU count difference (e.g., intentionally partitioned GPUs).

## Key Features

### DCGM Integration
Leverages NVIDIA's Data Center GPU Manager for comprehensive GPU health monitoring with proven reliability.

### GPU Count Mismatch Detection
Compares hardware-level PCIe GPU enumeration (sysfs) against DCGM's software-visible GPU count to catch GPUs that fail driver initialization or silently drop off the bus -- scenarios that DCGM health watches alone cannot detect.

### Automatic State Management
Maintains entity-level cache to track reported issues and avoid sending duplicate events within a single boot session.

9 changes: 9 additions & 0 deletions health-monitors/gpu-health-monitor/gpu_health_monitor/cli.py
@@ -78,6 +78,13 @@ def _init_event_processor(
help="Event processing strategy: EXECUTE_REMEDIATION or STORE_ONLY",
required=False,
)
@click.option(
"--gpu-count-check-enabled",
type=bool,
default=True,
help="Enable GPU count check comparing PCIe bus vs driver-visible GPUs",
required=False,
)
def cli(
dcgm_addr,
dcgm_error_mapping_config_file,
@@ -88,6 +95,7 @@ def cli(
dcgm_k8s_service_enabled,
metadata_path,
processing_strategy,
gpu_count_check_enabled,
):
exit = Event()
config = configparser.ConfigParser()
@@ -164,6 +172,7 @@ def process_exit_signal(signum, frame):
poll_interval_seconds=int(dcgm_config["PollIntervalSeconds"]),
callbacks=enabled_event_processors,
dcgm_k8s_service_enabled=dcgm_k8s_service_enabled,
gpu_count_check_enabled=gpu_count_check_enabled,
)
dcgm_watcher.start([], exit)

@@ -23,6 +23,8 @@
import time
import os

from gpu_health_monitor.pcie import get_nvidia_gpu_pci_addresses

DELAY, MULTIPLIER, MAX_DELAY = 2, 1.5, 120
DCGM_4_PYTHON_PATH = "/usr/share/datacenter-gpu-manager-4/bindings/python3"

@@ -34,6 +36,7 @@ def __init__(
poll_interval_seconds: int,
callbacks: list[types.CallbackInterface],
dcgm_k8s_service_enabled: bool,
gpu_count_check_enabled: bool = True,
) -> None:
self._addr = addr
self._poll_interval_seconds = poll_interval_seconds
@@ -48,6 +51,7 @@ def __init__(

self._callback_thread_pool = ThreadPoolExecutor()
self._dcgm_k8s_service_enabled = dcgm_k8s_service_enabled
self._gpu_count_check_enabled = gpu_count_check_enabled

def _get_available_health_watches(self) -> dict[int, str]:
health_watches = {}
@@ -269,6 +273,23 @@ def start(self, fields_to_monitor: list[str], exit: Event) -> None:
dcgm_group = None
gpu_ids = []

# Cache sysfs GPU addresses (static per boot) if GPU count check is enabled
sysfs_gpu_pci_addresses = []
metrics.gpu_count_check_sysfs_scan_failure.set(0)
if self._gpu_count_check_enabled:
sysfs_gpu_pci_addresses = get_nvidia_gpu_pci_addresses()
if not sysfs_gpu_pci_addresses:
log.warning(
"Sysfs GPU enumeration returned 0 GPUs (/sys may not be mounted or "
"accessible). Disabling GPU count check to avoid false results."
)
metrics.gpu_count_check_sysfs_scan_failure.set(1)
self._gpu_count_check_enabled = False
else:
log.info(f"Sysfs GPU count (PCIe hardware): {len(sysfs_gpu_pci_addresses)}")
else:
log.info("GPU count check is disabled")

# Initial DCGM handle and monitoring setup
while not exit.is_set():
with metrics.overall_reconcile_loop_time.time():
@@ -300,6 +321,18 @@ def start(self, fields_to_monitor: list[str], exit: Event) -> None:
[health_status, gpu_ids],
)

# GPU count check: compare sysfs (hardware) vs DCGM
if self._gpu_count_check_enabled:
if len(sysfs_gpu_pci_addresses) != len(gpu_ids):
log.warning(
f"GPU count mismatch detected: sysfs={len(sysfs_gpu_pci_addresses)}, "
f"DCGM={len(gpu_ids)}"
)
self._fire_callback_funcs(
types.CallbackInterface.gpu_count_check_completed.__name__,
[sysfs_gpu_pci_addresses, gpu_ids],
)

log.debug("Waiting till next cycle")
exit.wait(self._poll_interval_seconds)

@@ -35,3 +35,7 @@
"Number of times an error has occurred",
labelnames=["error_name"],
)
gpu_count_check_sysfs_scan_failure = Gauge(
"gpu_count_check_sysfs_scan_failure",
"Set to 1 when sysfs GPU enumeration returns zero GPUs, disabling the GPU count check",
)
@@ -49,3 +49,12 @@ def health_event_occurred(self, health_details: dict[str, HealthDetails], gpu_id
def dcgm_connectivity_failed(self):
"""Called when DCGM connectivity fails during health check."""
pass

@abc.abstractmethod
def gpu_count_check_completed(self, sysfs_gpu_pci_addresses: list[str], dcgm_gpu_ids: list[int]) -> None:
"""Called every cycle after health check with sysfs and DCGM GPU info.

The implementation compares counts and handles mismatch detection
and resolution internally.
"""
pass
@@ -0,0 +1,17 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .gpu_counter import get_nvidia_gpu_pci_addresses

__all__ = ["get_nvidia_gpu_pci_addresses"]
@@ -0,0 +1,67 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import logging as log

NVIDIA_VENDOR_ID = "0x10de"
GPU_CLASS_PREFIX = "0x0302"  # PCI class prefix for 3D controllers such as A100 and H100


def get_nvidia_gpu_pci_addresses(sysfs_base: str = "/sys") -> list[str]:
"""Scan sysfs for NVIDIA GPU PCI addresses.

Reads /sys/bus/pci/devices/*/vendor and /sys/bus/pci/devices/*/class
to find NVIDIA GPUs (vendor 0x10de, class prefix 0x0302).

Args:
sysfs_base: Base path for sysfs. Default "/sys". Override for testing.

Returns:
List of PCI addresses like ['0000:04:00.0', '0000:05:00.0', ...]
"""
devices_path = os.path.join(sysfs_base, "bus", "pci", "devices")
gpu_addresses = []

try:
entries = os.listdir(devices_path)
except OSError as e:
log.exception(f"Failed to read PCI devices directory {devices_path}")
return []

for entry in sorted(entries):
device_path = os.path.join(devices_path, entry)

vendor = _read_sysfs_file(os.path.join(device_path, "vendor"))
if vendor != NVIDIA_VENDOR_ID:
continue

device_class = _read_sysfs_file(os.path.join(device_path, "class"))
if not device_class or not device_class.startswith(GPU_CLASS_PREFIX):
continue

gpu_addresses.append(entry) # entry is the PCI address like "0000:04:00.0"
log.debug(f"Found NVIDIA GPU on PCIe bus: {entry} (class={device_class})")

log.info(f"Sysfs GPU enumeration complete: {len(gpu_addresses)} NVIDIA GPUs found")
return gpu_addresses


def _read_sysfs_file(path: str) -> str:
"""Read and strip a sysfs file. Returns empty string on error."""
try:
with open(path, "r") as f:
return f.read().strip()
except OSError:
return ""