From 4b5fb32dea460fbaad7eae6f21ad2d6f5e2b0848 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 07:58:24 -0500 Subject: [PATCH 01/20] Add multi-cuda wheel build --- .../build-and-test-python-wheels.yml | 19 +- .github/workflows/build-wheels.yml | 9 +- .gitignore | 17 ++ ci/build_multi_cuda_wheel.sh | 199 ++++++++++++++++++ ci/build_pynvbench_wheel_for_cuda.sh | 77 +++++++ ci/test_pynvbench.sh | 6 +- ci/test_pynvbench_inner.sh | 15 ++ python/CMakeLists.txt | 27 ++- python/cuda/bench/__init__.py | 73 +++++-- python/pyproject.toml | 20 +- 10 files changed, 408 insertions(+), 54 deletions(-) create mode 100755 ci/build_multi_cuda_wheel.sh create mode 100755 ci/build_pynvbench_wheel_for_cuda.sh diff --git a/.github/workflows/build-and-test-python-wheels.yml b/.github/workflows/build-and-test-python-wheels.yml index d51f504e..377e0d6a 100644 --- a/.github/workflows/build-and-test-python-wheels.yml +++ b/.github/workflows/build-and-test-python-wheels.yml @@ -9,9 +9,9 @@ defaults: shell: bash --noprofile --norc -euo pipefail {0} jobs: - # Build wheels for all CUDA/Python combinations + # Build multi-CUDA wheels (one wheel per Python version, includes both CUDA 12 & 13) build-wheels: - name: Build wheel (CUDA ${{ matrix.cuda }}, Python ${{ matrix.python }}) + name: Build wheel (Python ${{ matrix.python }}) runs-on: ubuntu-latest permissions: id-token: write @@ -19,7 +19,6 @@ jobs: strategy: fail-fast: false matrix: - cuda: ['12', '13'] python: ['3.10', '3.11', '3.12', '3.13'] steps: @@ -32,19 +31,20 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - - name: Build wheel + - name: Build multi-CUDA wheel run: | - bash ci/build_pynvbench_wheel.sh -py-version ${{ matrix.python }} -cuda-version ${{ matrix.cuda }} + bash ci/build_multi_cuda_wheel.sh -py-version ${{ matrix.python }} - name: Upload wheel artifact uses: actions/upload-artifact@v4 with: - name: wheel-pynvbench-cu${{ matrix.cuda }}-py${{ matrix.python }} + name: wheel-pynvbench-py${{ matrix.python }} path: wheelhouse/*.whl retention-days: 7 if-no-files-found: error - # Test wheels for all CUDA/Python combinations + # Test wheels on all CUDA/Python combinations + # Each wheel contains extensions for both CUDA 12 & 13, runtime detection picks the right one test-wheels: name: Test wheel (CUDA ${{ matrix.cuda }}, Python ${{ matrix.python }}) needs: build-wheels @@ -68,7 +68,7 @@ jobs: - name: Download wheel artifact uses: actions/download-artifact@v4 with: - name: wheel-pynvbench-cu${{ matrix.cuda }}-py${{ matrix.python }} + name: wheel-pynvbench-py${{ matrix.python }} path: wheelhouse - name: Test wheel @@ -76,8 +76,10 @@ jobs: # Use the same rapidsai/ci-wheel Docker image as build if [[ "${{ matrix.cuda }}" == "12" ]]; then cuda_full_version="12.9.1" + cuda_extra="[cu12]" else cuda_full_version="13.0.1" + cuda_extra="[cu13]" fi docker run --rm \ @@ -86,6 +88,7 @@ jobs: --mount type=bind,source=$(pwd),target=/workspace/ \ --env py_version=${{ matrix.python }} \ --env cuda_version=${{ matrix.cuda }} \ + --env cuda_extra="${cuda_extra}" \ rapidsai/ci-wheel:25.12-cuda${cuda_full_version}-rockylinux8-py${{ matrix.python }} \ /workspace/ci/test_pynvbench_inner.sh diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 37dfb3d7..4ae97491 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -10,7 +10,7 @@ defaults: jobs: build-wheels: - name: Build wheel (CUDA ${{ matrix.cuda }}, Python ${{ matrix.python }}) + name: Build wheel (Python 
${{ matrix.python }}) runs-on: ubuntu-latest permissions: id-token: write @@ -18,7 +18,6 @@ jobs: strategy: fail-fast: false matrix: - cuda: ['12', '13'] python: ['3.10', '3.11', '3.12', '3.13'] steps: @@ -31,14 +30,14 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - - name: Build wheel + - name: Build multi-CUDA wheel run: | - bash ci/build_pynvbench_wheel.sh -py-version ${{ matrix.python }} -cuda-version ${{ matrix.cuda }} + bash ci/build_multi_cuda_wheel.sh -py-version ${{ matrix.python }} - name: Upload wheel artifact uses: actions/upload-artifact@v4 with: - name: wheel-pynvbench-cu${{ matrix.cuda }}-py${{ matrix.python }} + name: wheel-pynvbench-py${{ matrix.python }} path: wheelhouse/*.whl retention-days: 7 if-no-files-found: error diff --git a/.gitignore b/.gitignore index 2f64907d..a5c769c6 100644 --- a/.gitignore +++ b/.gitignore @@ -8,11 +8,28 @@ cmake-build-* *~ compile_commands.json CMakeUserPresets.json +<<<<<<< HEAD # Python wheel builds wheelhouse/ +======= +<<<<<<< Updated upstream +======= + +# Python wheel builds +wheelhouse/ +wheelhouse_merged/ +wheelhouse_final/ +dist/ +dist_*/ +>>>>>>> 1ab0a2e (Add multi-cuda wheel build) *.whl *.egg-info/ __pycache__/ *.pyc *.pyo +<<<<<<< HEAD +======= +.pytest_cache/ +>>>>>>> Stashed changes +>>>>>>> 1ab0a2e (Add multi-cuda wheel build) diff --git a/ci/build_multi_cuda_wheel.sh b/ci/build_multi_cuda_wheel.sh new file mode 100755 index 00000000..af0ca61e --- /dev/null +++ b/ci/build_multi_cuda_wheel.sh @@ -0,0 +1,199 @@ +#!/bin/bash +# Apache 2.0 License +# Copyright 2024-2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +usage="Usage: $0 -py-version [additional options...]" + +source "$ci_dir/util/python/common_arg_parser.sh" +parse_python_args "$@" + +# Check if py_version was provided (this script requires it) +require_py_version "$usage" || exit 1 + +echo "Docker socket: " $(ls /var/run/docker.sock) + +# Set HOST_WORKSPACE if not already set (for local runs) +if [[ -z "${HOST_WORKSPACE:-}" ]]; then + # Get the repository root + HOST_WORKSPACE="$(cd "${ci_dir}/.." && pwd)" + echo "Setting HOST_WORKSPACE to: $HOST_WORKSPACE" +fi + +if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + # Prepare mount points etc for getting artifacts in/out of the container. + source "$ci_dir/util/artifacts/common.sh" + # Note that these mounts use the runner (not the devcontainer) filesystem for + # source directories because of docker-out-of-docker quirks. + # The workflow-job GH actions make sure that they exist before running any + # scripts. + action_mounts=$(cat < /dev/null; then + PYTHON=python +elif command -v python3 &> /dev/null; then + PYTHON=python3 +else + echo "Error: No python found" + exit 1 +fi + +# Needed for unpacking and repacking wheels. 
+$PYTHON -m pip install --break-system-packages wheel + +# Find the built wheels +cu12_wheel=$(find wheelhouse -name "*cu12*.whl" | head -1) +cu13_wheel=$(find wheelhouse -name "*cu13*.whl" | head -1) + +if [[ -z "$cu12_wheel" ]]; then + echo "Error: CUDA 12 wheel not found in wheelhouse/" + ls -la wheelhouse/ + exit 1 +fi + +if [[ -z "$cu13_wheel" ]]; then + echo "Error: CUDA 13 wheel not found in wheelhouse/" + ls -la wheelhouse/ + exit 1 +fi + +echo "Found CUDA 12 wheel: $cu12_wheel" +echo "Found CUDA 13 wheel: $cu13_wheel" + +# Convert to absolute paths before changing directory +cu12_wheel=$(readlink -f "$cu12_wheel") +cu13_wheel=$(readlink -f "$cu13_wheel") + +# Merge the wheels manually (similar to CCCL) +mkdir -p wheelhouse_merged +cd wheelhouse_merged + +# Unpack CUDA 12 wheel (this will be our base) +$PYTHON -m wheel unpack "$cu12_wheel" +base_dir=$(find . -maxdepth 1 -type d -name "pynvbench-*" | head -1) + +# Unpack CUDA 13 wheel into a temporary subdirectory +mkdir cu13_tmp +cd cu13_tmp +$PYTHON -m wheel unpack "$cu13_wheel" +cu13_dir=$(find . -maxdepth 1 -type d -name "pynvbench-*" | head -1) + +# Copy the CUDA 13 extension into the base wheel +cp "$cu13_dir"/cuda/bench/_nvbench_cu13*.so "../$base_dir/cuda/bench/" + +# Go back and clean up +cd .. +rm -rf cu13_tmp + +# Repack the merged wheel +$PYTHON -m wheel pack "$base_dir" + +cd .. + +# Install auditwheel and repair the merged wheel +$PYTHON -m pip install --break-system-packages patchelf auditwheel +for wheel in wheelhouse_merged/pynvbench-*.whl; do + echo "Repairing merged wheel: $wheel" + $PYTHON -m auditwheel repair \ + --exclude 'libcuda.so.1' \ + --exclude 'libnvidia-ml.so.1' \ + "$wheel" \ + --wheel-dir wheelhouse_final +done + +# Clean up intermediate files and move only the final merged wheel to wheelhouse +rm -rf wheelhouse/* # Clean existing wheelhouse +mkdir -p wheelhouse + +# Move only the final repaired merged wheel +if ls wheelhouse_final/pynvbench-*.whl 1> /dev/null 2>&1; then + mv wheelhouse_final/pynvbench-*.whl wheelhouse/ + echo "Final merged wheel moved to wheelhouse" +else + echo "No final repaired wheel found, moving unrepaired merged wheel" + mv wheelhouse_merged/pynvbench-*.whl wheelhouse/ +fi + +# Clean up temporary directories +rm -rf wheelhouse_merged wheelhouse_final + +echo "Final wheels in wheelhouse:" +ls -la wheelhouse/ + +if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + wheel_artifact_name="$(ci/util/workflow/get_wheel_artifact_name.sh)" + ci/util/artifacts/upload.sh $wheel_artifact_name 'wheelhouse/.*' +fi diff --git a/ci/build_pynvbench_wheel_for_cuda.sh b/ci/build_pynvbench_wheel_for_cuda.sh new file mode 100755 index 00000000..0e3d4b7d --- /dev/null +++ b/ci/build_pynvbench_wheel_for_cuda.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# Apache 2.0 License +# Copyright 2024-2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euxo pipefail + +# Target script for `docker run` command in build_multi_cuda_wheel.sh +# This script builds a single wheel for the container's CUDA version +# The /workspace pathnames are hard-wired here. + +# Install GCC 13 toolset (needed for the build) +/workspace/ci/util/retry.sh 5 30 dnf -y install gcc-toolset-13-gcc gcc-toolset-13-gcc-c++ +echo -e "#!/bin/bash\nsource /opt/rh/gcc-toolset-13/enable" >/etc/profile.d/enable_devtools.sh +source /etc/profile.d/enable_devtools.sh + +# Check what's available +which gcc +gcc --version +which nvcc +nvcc --version + +# Set up Python environment +source /workspace/ci/pyenv_helper.sh +setup_python_env "${py_version}" +which python +python --version +echo "Done setting up python env" + +# Ensure we have full git history for setuptools_scm +if $(git rev-parse --is-shallow-repository); then + git fetch --unshallow +fi + +cd /workspace/python + +# Determine CUDA version from nvcc +cuda_version=$(nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+' | cut -d. -f1) +echo "Detected CUDA version: ${cuda_version}" + +# Configure compilers +export CXX="$(which g++)" +export CUDACXX="$(which nvcc)" +export CUDAHOSTCXX="$(which g++)" + +# Set CUDA suffix for extension naming +export PYNVBENCH_CUDA_SUFFIX="_cu${cuda_version}" + +# Build the wheel +python -m pip wheel --no-deps --verbose --wheel-dir dist . + +# Rename wheel to include CUDA version suffix +for wheel in dist/pynvbench-*.whl; do + if [[ -f "$wheel" ]]; then + base_name=$(basename "$wheel" .whl) + new_name="${base_name}.cu${cuda_version}.whl" + mv "$wheel" "dist/${new_name}" + echo "Renamed wheel to: ${new_name}" + fi +done + +# Move wheel to output directory +mkdir -p /workspace/wheelhouse +mv dist/pynvbench-*.cu*.whl /workspace/wheelhouse/ diff --git a/ci/test_pynvbench.sh b/ci/test_pynvbench.sh index b021f0b5..ec3e4573 100755 --- a/ci/test_pynvbench.sh +++ b/ci/test_pynvbench.sh @@ -40,11 +40,13 @@ if [[ -z "$cuda_version" ]]; then exit 1 fi -# Map cuda_version to full version +# Map cuda_version to full version and set CUDA extra if [[ "$cuda_version" == "12" ]]; then cuda_full_version="12.9.1" + cuda_extra="[cu12]" elif [[ "$cuda_version" == "13" ]]; then cuda_full_version="13.0.1" + cuda_extra="[cu13]" else echo "Error: Unsupported CUDA version: $cuda_version" exit 1 @@ -66,9 +68,11 @@ echo "::group::🧪 Testing CUDA ${cuda_version} wheel on ${cuda_image}" docker pull $cuda_image docker run --rm -i \ --workdir /workspace \ + --gpus all \ --mount type=bind,source=$(pwd),target=/workspace/ \ --env py_version=${py_version} \ --env cuda_version=${cuda_version} \ + --env cuda_extra="${cuda_extra}" \ $cuda_image \ /workspace/ci/test_pynvbench_inner.sh # Prevent GHA runners from exhausting available storage with leftover images: diff --git a/ci/test_pynvbench_inner.sh b/ci/test_pynvbench_inner.sh index 80748fa9..02d8ee0b 100755 --- a/ci/test_pynvbench_inner.sh +++ b/ci/test_pynvbench_inner.sh @@ -24,19 +24,34 @@ echo "CUDA version: $(nvcc --version | grep release)" # Wheel should be in /workspace/wheelhouse (downloaded by workflow or built locally) WHEELHOUSE_DIR="/workspace/wheelhouse" +<<<<<<< HEAD # Find and install pynvbench wheel # Look for .cu${cuda_version} in the version string (e.g., pynvbench-0.0.1.dev1+g123.cu12-...) 
PYNVBENCH_WHEEL_PATH="$(ls ${WHEELHOUSE_DIR}/pynvbench-*.cu${cuda_version}-*.whl 2>/dev/null | head -1)" if [[ -z "$PYNVBENCH_WHEEL_PATH" ]]; then echo "Error: No pynvbench wheel found in ${WHEELHOUSE_DIR}" echo "Looking for: pynvbench-*.cu${cuda_version}-*.whl" +======= +# Find the pynvbench wheel (multi-CUDA wheel, no CUDA version in filename) +PYNVBENCH_WHEEL_PATH="$(ls ${WHEELHOUSE_DIR}/pynvbench-*.whl 2>/dev/null | head -1)" +if [[ -z "$PYNVBENCH_WHEEL_PATH" ]]; then + echo "Error: No pynvbench wheel found in ${WHEELHOUSE_DIR}" +>>>>>>> 1ab0a2e (Add multi-cuda wheel build) echo "Contents of ${WHEELHOUSE_DIR}:" ls -la ${WHEELHOUSE_DIR}/ || true exit 1 fi +<<<<<<< HEAD echo "Installing wheel: $PYNVBENCH_WHEEL_PATH" python -m pip install "${PYNVBENCH_WHEEL_PATH}[test]" +======= +# Determine which CUDA extra to install (defaults to cu12 if not specified) +CUDA_EXTRA="${cuda_extra:-[cu${cuda_version}]}" + +echo "Installing wheel: $PYNVBENCH_WHEEL_PATH with extras: ${CUDA_EXTRA}" +python -m pip install "${PYNVBENCH_WHEEL_PATH}${CUDA_EXTRA},test" +>>>>>>> 1ab0a2e (Add multi-cuda wheel build) # Run tests cd "/workspace/python/test/" diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b18f7ef1..c0dda5a0 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -25,15 +25,28 @@ CPMAddPackage( CPMAddPackage("gh:pybind/pybind11@3.0.0") -pybind11_add_module(_nvbench MODULE src/py_nvbench.cpp) -target_link_libraries(_nvbench PUBLIC nvbench::nvbench) -target_link_libraries(_nvbench PRIVATE CUDA::cudart_static) +# Determine CUDA major version for extension naming +string(REGEX MATCH "^([0-9]+)" CUDA_VERSION_MAJOR "${CUDAToolkit_VERSION}") -set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN") -set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) -set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON) +# Allow override of extension suffix via environment variable (for multi-CUDA builds) +if(DEFINED ENV{PYNVBENCH_CUDA_SUFFIX}) + set(NVBENCH_EXTENSION_SUFFIX $ENV{PYNVBENCH_CUDA_SUFFIX}) +else() + set(NVBENCH_EXTENSION_SUFFIX "_cu${CUDA_VERSION_MAJOR}") +endif() -install(TARGETS _nvbench DESTINATION cuda/bench) +set(NVBENCH_EXTENSION_NAME "_nvbench${NVBENCH_EXTENSION_SUFFIX}") +message(STATUS "Building extension: ${NVBENCH_EXTENSION_NAME}") + +pybind11_add_module(${NVBENCH_EXTENSION_NAME} MODULE src/py_nvbench.cpp) +target_link_libraries(${NVBENCH_EXTENSION_NAME} PUBLIC nvbench::nvbench) +target_link_libraries(${NVBENCH_EXTENSION_NAME} PRIVATE CUDA::cudart_static) + +set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INSTALL_RPATH "$ORIGIN") +set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) +set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) + +install(TARGETS ${NVBENCH_EXTENSION_NAME} DESTINATION cuda/bench) # Determine target that nvbench::nvbench is an alias of, # necessary because ALIAS targets cannot be installed diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py index e1d2282a..d2439fe3 100644 --- a/python/cuda/bench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import importlib import importlib.metadata import warnings @@ -31,29 +32,57 @@ f"Version is set to fall-back value '{__version__}' instead." 
) + +# Detect CUDA runtime version and load appropriate extension +def _get_cuda_major_version(): + """Detect the CUDA runtime major version.""" + try: + from cuda import cuda as cuda_bindings + + err, version = cuda_bindings.cuRuntimeGetVersion() + if err != cuda_bindings.cudaError_t.cudaSuccess: + raise RuntimeError(f"Failed to get CUDA runtime version: {err}") + # Version is encoded as (major * 1000) + (minor * 10) + major = version // 1000 + return major + except ImportError: + raise ImportError( + "cuda-bindings is required for runtime CUDA version detection. " + "Install with: pip install pynvbench[cu12] or pip install pynvbench[cu13]" + ) + + +_cuda_major = _get_cuda_major_version() +_extension_name = f"_nvbench_cu{_cuda_major}" + +try: + _nvbench_module = importlib.import_module(f"cuda.bench.{_extension_name}") +except ImportError as e: + raise ImportError( + f"No pynvbench extension found for CUDA {_cuda_major}.x. " + f"This wheel may not include support for your CUDA version. " + f"Supported CUDA versions: 12, 13. " + f"Original error: {e}" + ) + +# Load required NVIDIA libraries for libname in ("cupti", "nvperf_target", "nvperf_host"): load_nvidia_dynamic_lib(libname) -from cuda.bench._nvbench import ( # noqa: E402 - Benchmark as Benchmark, -) -from cuda.bench._nvbench import ( # noqa: E402 - CudaStream as CudaStream, -) -from cuda.bench._nvbench import ( # noqa: E402 - Launch as Launch, -) -from cuda.bench._nvbench import ( # noqa: E402 - NVBenchRuntimeError as NVBenchRuntimeError, -) -from cuda.bench._nvbench import ( # noqa: E402 - State as State, -) -from cuda.bench._nvbench import ( # noqa: E402 - register as register, -) -from cuda.bench._nvbench import ( # noqa: E402 - run_all_benchmarks as run_all_benchmarks, -) +# Import and expose all public symbols from the CUDA-specific extension +Benchmark = _nvbench_module.Benchmark +CudaStream = _nvbench_module.CudaStream +Launch = _nvbench_module.Launch +NVBenchRuntimeError = _nvbench_module.NVBenchRuntimeError +State = _nvbench_module.State +register = _nvbench_module.register +run_all_benchmarks = _nvbench_module.run_all_benchmarks -del load_nvidia_dynamic_lib +# Clean up internal symbols +del ( + load_nvidia_dynamic_lib, + _nvbench_module, + _cuda_major, + _extension_name, + _get_cuda_major_version, +) diff --git a/python/pyproject.toml b/python/pyproject.toml index d5356705..9d2baebb 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -22,22 +22,20 @@ classifiers = [ ] requires-python = ">=3.10" dependencies = [ - # pathfinder + # pathfinder for finding CUDA libraries "cuda-pathfinder", - - # Library expects to find shared libraries - # libcupti, libnvperf_target, libnvperf_host - # pathfinder is used to find it in the Python layout - "nvidia-cuda-cupti-cu12", - - # The shared library - # libnvidia-ml must be installed system-wide - # (Debian package provider: libnvidia-compute) ] dynamic = ["version"] readme = { file = "README.md", content-type = "text/markdown" } [project.optional-dependencies] +# CUDA 12.x dependencies +cu12 = ["cuda-bindings>=12.0.0,<13.0.0", "nvidia-cuda-cupti-cu12"] + +# CUDA 13.x dependencies +cu13 = ["cuda-bindings>=13.0.0,<14.0.0", "nvidia-cuda-cupti-cu13"] + +# Test dependencies (using CUDA 12 by default) test = ["pytest", "cupy-cuda12x", "numba"] [project.urls] @@ -51,7 +49,7 @@ build-dir = "build/{wheel_tag}" [tool.scikit-build.cmake] version = ">=3.30.4" -args = [] +args = ["-DNVBench_ENABLE_NVML=OFF", "-DNVBench_ENABLE_CUPTI=OFF"] build-type = "Release" source-dir = "." 
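
The loader added to python/cuda/bench/__init__.py above boils down to one pattern: probe which CUDA major version the environment targets, then import the matching compiled extension. Below is a minimal standalone sketch of that pattern, not the exact patched file: the extension names `_nvbench_cu12` / `_nvbench_cu13` follow this patch, and the version probe via `cuda.bindings.__version__` is the variant a later patch in this series switches to (cuda-bindings tracks the CUDA Toolkit major version).

import importlib


def _detect_cuda_major() -> int:
    """Return the CUDA major version the current environment targets."""
    try:
        # cuda-bindings is pulled in via the pynvbench[cu12] / pynvbench[cu13]
        # extras; its version tracks the CUDA Toolkit (e.g. "12.9.1", "13.0.0"),
        # so the first component identifies the CUDA line.
        import cuda.bindings
    except ImportError as exc:
        raise ImportError(
            "cuda-bindings is required for CUDA version detection; "
            "install pynvbench[cu12] or pynvbench[cu13]"
        ) from exc
    return int(cuda.bindings.__version__.split(".")[0])


def _load_nvbench_extension():
    """Import the compiled extension matching the detected CUDA major version."""
    major = _detect_cuda_major()
    name = f"cuda.bench._nvbench_cu{major}"  # e.g. cuda.bench._nvbench_cu12
    try:
        return importlib.import_module(name)
    except ImportError as exc:
        raise ImportError(
            f"No pynvbench extension found for CUDA {major}.x in this wheel "
            f"(supported: 12, 13)"
        ) from exc

With the merged wheel, `pip install pynvbench[cu12]` or `pip install pynvbench[cu13]` brings in the matching cuda-bindings and CUPTI packages, and this loader selects the corresponding extension at import time.
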
From a0206000f4e540f5ac712e37bf95cd0bb86e8104 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 08:01:48 -0500 Subject: [PATCH 02/20] Update .gitignore --- .gitignore | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.gitignore b/.gitignore index a5c769c6..eec6d226 100644 --- a/.gitignore +++ b/.gitignore @@ -8,13 +8,6 @@ cmake-build-* *~ compile_commands.json CMakeUserPresets.json -<<<<<<< HEAD - -# Python wheel builds -wheelhouse/ -======= -<<<<<<< Updated upstream -======= # Python wheel builds wheelhouse/ @@ -22,14 +15,9 @@ wheelhouse_merged/ wheelhouse_final/ dist/ dist_*/ ->>>>>>> 1ab0a2e (Add multi-cuda wheel build) *.whl *.egg-info/ __pycache__/ *.pyc *.pyo -<<<<<<< HEAD -======= .pytest_cache/ ->>>>>>> Stashed changes ->>>>>>> 1ab0a2e (Add multi-cuda wheel build) From 18ecf118643e1ee84052eb9800e68a9c5cfa9d8e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 08:05:34 -0500 Subject: [PATCH 03/20] No need for the artifacts utilities --- ci/build_multi_cuda_wheel.sh | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/ci/build_multi_cuda_wheel.sh b/ci/build_multi_cuda_wheel.sh index af0ca61e..bd5a953b 100755 --- a/ci/build_multi_cuda_wheel.sh +++ b/ci/build_multi_cuda_wheel.sh @@ -37,24 +37,6 @@ if [[ -z "${HOST_WORKSPACE:-}" ]]; then echo "Setting HOST_WORKSPACE to: $HOST_WORKSPACE" fi -if [[ -n "${GITHUB_ACTIONS:-}" ]]; then - # Prepare mount points etc for getting artifacts in/out of the container. - source "$ci_dir/util/artifacts/common.sh" - # Note that these mounts use the runner (not the devcontainer) filesystem for - # source directories because of docker-out-of-docker quirks. - # The workflow-job GH actions make sure that they exist before running any - # scripts. - action_mounts=$(cat < Date: Thu, 4 Dec 2025 10:03:26 -0500 Subject: [PATCH 04/20] Updates to correctly handle multi-CUDA wheels --- ci/test_pynvbench.sh | 4 ++-- ci/test_pynvbench_inner.sh | 27 +++++++++------------------ python/CMakeLists.txt | 3 ++- python/cuda/bench/__init__.py | 14 ++++++++------ python/src/py_nvbench.cpp | 6 +++++- 5 files changed, 26 insertions(+), 28 deletions(-) diff --git a/ci/test_pynvbench.sh b/ci/test_pynvbench.sh index ec3e4573..53451515 100755 --- a/ci/test_pynvbench.sh +++ b/ci/test_pynvbench.sh @@ -43,10 +43,10 @@ fi # Map cuda_version to full version and set CUDA extra if [[ "$cuda_version" == "12" ]]; then cuda_full_version="12.9.1" - cuda_extra="[cu12]" + cuda_extra="cu12" elif [[ "$cuda_version" == "13" ]]; then cuda_full_version="13.0.1" - cuda_extra="[cu13]" + cuda_extra="cu13" else echo "Error: Unsupported CUDA version: $cuda_version" exit 1 diff --git a/ci/test_pynvbench_inner.sh b/ci/test_pynvbench_inner.sh index 02d8ee0b..cc0e7527 100755 --- a/ci/test_pynvbench_inner.sh +++ b/ci/test_pynvbench_inner.sh @@ -24,34 +24,25 @@ echo "CUDA version: $(nvcc --version | grep release)" # Wheel should be in /workspace/wheelhouse (downloaded by workflow or built locally) WHEELHOUSE_DIR="/workspace/wheelhouse" -<<<<<<< HEAD -# Find and install pynvbench wheel -# Look for .cu${cuda_version} in the version string (e.g., pynvbench-0.0.1.dev1+g123.cu12-...) 
-PYNVBENCH_WHEEL_PATH="$(ls ${WHEELHOUSE_DIR}/pynvbench-*.cu${cuda_version}-*.whl 2>/dev/null | head -1)" +# Find the pynvbench wheel (multi-CUDA wheel) +# Prefer manylinux wheels, fall back to any wheel +PYNVBENCH_WHEEL_PATH="$(ls ${WHEELHOUSE_DIR}/pynvbench-*manylinux*.whl 2>/dev/null | head -1)" if [[ -z "$PYNVBENCH_WHEEL_PATH" ]]; then - echo "Error: No pynvbench wheel found in ${WHEELHOUSE_DIR}" - echo "Looking for: pynvbench-*.cu${cuda_version}-*.whl" -======= -# Find the pynvbench wheel (multi-CUDA wheel, no CUDA version in filename) -PYNVBENCH_WHEEL_PATH="$(ls ${WHEELHOUSE_DIR}/pynvbench-*.whl 2>/dev/null | head -1)" + PYNVBENCH_WHEEL_PATH="$(ls ${WHEELHOUSE_DIR}/pynvbench-*.whl 2>/dev/null | head -1)" +fi + if [[ -z "$PYNVBENCH_WHEEL_PATH" ]]; then echo "Error: No pynvbench wheel found in ${WHEELHOUSE_DIR}" ->>>>>>> 1ab0a2e (Add multi-cuda wheel build) echo "Contents of ${WHEELHOUSE_DIR}:" ls -la ${WHEELHOUSE_DIR}/ || true exit 1 fi -<<<<<<< HEAD -echo "Installing wheel: $PYNVBENCH_WHEEL_PATH" -python -m pip install "${PYNVBENCH_WHEEL_PATH}[test]" -======= # Determine which CUDA extra to install (defaults to cu12 if not specified) -CUDA_EXTRA="${cuda_extra:-[cu${cuda_version}]}" +CUDA_EXTRA="${cuda_extra:-cu${cuda_version}}" -echo "Installing wheel: $PYNVBENCH_WHEEL_PATH with extras: ${CUDA_EXTRA}" -python -m pip install "${PYNVBENCH_WHEEL_PATH}${CUDA_EXTRA},test" ->>>>>>> 1ab0a2e (Add multi-cuda wheel build) +echo "Installing wheel: $PYNVBENCH_WHEEL_PATH with extras: ${CUDA_EXTRA},test" +python -m pip install "${PYNVBENCH_WHEEL_PATH}[${CUDA_EXTRA},test]" # Run tests cd "/workspace/python/test/" diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index c0dda5a0..20cd71cc 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -39,8 +39,9 @@ set(NVBENCH_EXTENSION_NAME "_nvbench${NVBENCH_EXTENSION_SUFFIX}") message(STATUS "Building extension: ${NVBENCH_EXTENSION_NAME}") pybind11_add_module(${NVBENCH_EXTENSION_NAME} MODULE src/py_nvbench.cpp) +target_compile_definitions(${NVBENCH_EXTENSION_NAME} PRIVATE PYBIND11_MODULE_NAME=${NVBENCH_EXTENSION_NAME}) target_link_libraries(${NVBENCH_EXTENSION_NAME} PUBLIC nvbench::nvbench) -target_link_libraries(${NVBENCH_EXTENSION_NAME} PRIVATE CUDA::cudart_static) +target_link_libraries(${NVBENCH_EXTENSION_NAME} PRIVATE CUDA::cudart_static CUDA::cuda_driver) set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INSTALL_RPATH "$ORIGIN") set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py index d2439fe3..9c8a35ce 100644 --- a/python/cuda/bench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -37,13 +37,12 @@ def _get_cuda_major_version(): """Detect the CUDA runtime major version.""" try: - from cuda import cuda as cuda_bindings + import cuda.bindings - err, version = cuda_bindings.cuRuntimeGetVersion() - if err != cuda_bindings.cudaError_t.cudaSuccess: - raise RuntimeError(f"Failed to get CUDA runtime version: {err}") - # Version is encoded as (major * 1000) + (minor * 10) - major = version // 1000 + # Get CUDA version from cuda-bindings package version + # cuda-bindings version is in format like "12.9.1" or "13.0.0" + version_str = cuda.bindings.__version__ + major = int(version_str.split(".")[0]) return major except ImportError: raise ImportError( @@ -78,6 +77,9 @@ def _get_cuda_major_version(): register = _nvbench_module.register run_all_benchmarks = _nvbench_module.run_all_benchmarks +# Expose the module as 
_nvbench for backward compatibility (e.g., for tests) +_nvbench = _nvbench_module + # Clean up internal symbols del ( load_nvidia_dynamic_lib, diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 8856e8e7..a878f5ec 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -230,7 +230,11 @@ std::unique_ptr global_registry{}; // If you modify these bindings, please be sure to update the // corresponding type hints in ``../cuda/nvbench/__init__.pyi`` -PYBIND11_MODULE(_nvbench, m) +#ifndef PYBIND11_MODULE_NAME +#define PYBIND11_MODULE_NAME _nvbench +#endif + +PYBIND11_MODULE(PYBIND11_MODULE_NAME, m) { // == STEP 1 // Set environment variable CUDA_MODULE_LOADING=EAGER From f03d18ad9ac1c21d1bceb64d44ada7474f164785 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 10:30:32 -0500 Subject: [PATCH 05/20] Small fix --- .github/workflows/build-and-test-python-wheels.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-and-test-python-wheels.yml b/.github/workflows/build-and-test-python-wheels.yml index 377e0d6a..c58df68c 100644 --- a/.github/workflows/build-and-test-python-wheels.yml +++ b/.github/workflows/build-and-test-python-wheels.yml @@ -76,10 +76,10 @@ jobs: # Use the same rapidsai/ci-wheel Docker image as build if [[ "${{ matrix.cuda }}" == "12" ]]; then cuda_full_version="12.9.1" - cuda_extra="[cu12]" + cuda_extra="cu12" else cuda_full_version="13.0.1" - cuda_extra="[cu13]" + cuda_extra="cu13" fi docker run --rm \ From edd31c764d32c3608c0a1bc1339fd335181017b8 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 10:34:20 -0500 Subject: [PATCH 06/20] Reduce CI matrix --- .../build-and-test-python-wheels.yml | 6 +- .github/workflows/build-wheels.yml | 2 +- .github/workflows/pr.yml | 102 +++++++++--------- 3 files changed, 55 insertions(+), 55 deletions(-) diff --git a/.github/workflows/build-and-test-python-wheels.yml b/.github/workflows/build-and-test-python-wheels.yml index c58df68c..f856c650 100644 --- a/.github/workflows/build-and-test-python-wheels.yml +++ b/.github/workflows/build-and-test-python-wheels.yml @@ -19,7 +19,7 @@ jobs: strategy: fail-fast: false matrix: - python: ['3.10', '3.11', '3.12', '3.13'] + python: ['3.13'] steps: - name: Checkout repository @@ -55,8 +55,8 @@ jobs: strategy: fail-fast: false matrix: - cuda: ['12', '13'] - python: ['3.10', '3.11', '3.12', '3.13'] + cuda: ['13'] + python: ['3.13'] steps: - name: Checkout repository diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 4ae97491..993e60fa 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -18,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - python: ['3.10', '3.11', '3.12', '3.13'] + python: ['3.13'] steps: - name: Checkout repository diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index c42cda01..3653f510 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -35,46 +35,46 @@ permissions: pull-requests: read jobs: - compute-matrix: - name: Compute matrix - runs-on: ubuntu-latest - outputs: - DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} - PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} - PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} - base_sha: ${{ steps.export-pr-info.outputs.base_sha }} - pr_number: ${{ steps.export-pr-info.outputs.pr_number }} - steps: - - name: Checkout repo - uses: 
actions/checkout@v4 - - name: Lookup PR info - id: get-pr-info - uses: nv-gha-runners/get-pr-info@main - - name: Export PR info - id: export-pr-info - run: | - echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" - echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" - - name: Compute matrix outputs - id: set-outputs - run: | - .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request + # compute-matrix: + # name: Compute matrix + # runs-on: ubuntu-latest + # outputs: + # DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} + # PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} + # PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} + # base_sha: ${{ steps.export-pr-info.outputs.base_sha }} + # pr_number: ${{ steps.export-pr-info.outputs.pr_number }} + # steps: + # - name: Checkout repo + # uses: actions/checkout@v4 + # - name: Lookup PR info + # id: get-pr-info + # uses: nv-gha-runners/get-pr-info@main + # - name: Export PR info + # id: export-pr-info + # run: | + # echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" + # echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" + # - name: Compute matrix outputs + # id: set-outputs + # run: | + # .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request - nvbench: - name: NVBench CUDA${{ matrix.cuda_host_combination }} - permissions: - id-token: write - contents: read - needs: compute-matrix - uses: ./.github/workflows/dispatch-build-and-test.yml - strategy: - fail-fast: false - matrix: - cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} - with: - project_name: "nvbench" - per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} - devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} + # nvbench: + # name: NVBench CUDA${{ matrix.cuda_host_combination }} + # permissions: + # id-token: write + # contents: read + # needs: compute-matrix + # uses: ./.github/workflows/dispatch-build-and-test.yml + # strategy: + # fail-fast: false + # matrix: + # cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} + # with: + # project_name: "nvbench" + # per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} + # devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} python-wheels: name: Python Wheels @@ -83,16 +83,16 @@ jobs: contents: read uses: ./.github/workflows/build-and-test-python-wheels.yml - verify-devcontainers: - name: Verify Dev Containers - if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} - needs: compute-matrix - permissions: - id-token: write - contents: read - uses: ./.github/workflows/verify-devcontainers.yml - with: - base_sha: ${{ needs.compute-matrix.outputs.base_sha }} + # verify-devcontainers: + # name: Verify Dev Containers + # if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} + # needs: compute-matrix + # permissions: + # id-token: write + # contents: read + # uses: ./.github/workflows/verify-devcontainers.yml + # with: + # base_sha: ${{ needs.compute-matrix.outputs.base_sha }} # This job is the final 
job that runs after all other jobs and is used for branch protection status checks. # See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks @@ -102,9 +102,9 @@ jobs: name: CI if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success needs: - - nvbench + # - nvbench - python-wheels - - verify-devcontainers + # - verify-devcontainers steps: - name: Check status of all precursor jobs if: >- From d178a089324e0aac866ecdf7a579bb8626f4d290 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 10:54:35 -0500 Subject: [PATCH 07/20] =?UTF-8?q?Special=20handling=20for=20nvidia-cuda-cu?= =?UTF-8?q?pti=20for=20cuda=2013=20=F0=9F=A4=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 9d2baebb..f238fa6b 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -33,7 +33,7 @@ readme = { file = "README.md", content-type = "text/markdown" } cu12 = ["cuda-bindings>=12.0.0,<13.0.0", "nvidia-cuda-cupti-cu12"] # CUDA 13.x dependencies -cu13 = ["cuda-bindings>=13.0.0,<14.0.0", "nvidia-cuda-cupti-cu13"] +cu13 = ["cuda-bindings>=13.0.0,<14.0.0", "nvidia-cuda-cupti>=13.0"] # Test dependencies (using CUDA 12 by default) test = ["pytest", "cupy-cuda12x", "numba"] From fd2de8c9b0ddf5a6babfc79ca73b1ffcfd47a60d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 12:36:32 -0500 Subject: [PATCH 08/20] Fix issue causing segfault --- ci/build_multi_cuda_wheel.sh | 12 +++++++++++- ci/test_pynvbench_inner.sh | 5 +++-- python/pyproject.toml | 8 +++++++- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/ci/build_multi_cuda_wheel.sh b/ci/build_multi_cuda_wheel.sh index bd5a953b..fe7d4c9e 100755 --- a/ci/build_multi_cuda_wheel.sh +++ b/ci/build_multi_cuda_wheel.sh @@ -123,14 +123,24 @@ cd wheelhouse_merged $PYTHON -m wheel unpack "$cu12_wheel" base_dir=$(find . -maxdepth 1 -type d -name "pynvbench-*" | head -1) +# Rename libnvbench.so to libnvbench_cu12.so in the base (CUDA 12) wheel +mv "$base_dir/cuda/bench/libnvbench.so" "$base_dir/cuda/bench/libnvbench_cu12.so" + +# Update _nvbench_cu12.so to link to libnvbench_cu12.so instead of libnvbench.so +patchelf --replace-needed libnvbench.so libnvbench_cu12.so "$base_dir"/cuda/bench/_nvbench_cu12*.so + # Unpack CUDA 13 wheel into a temporary subdirectory mkdir cu13_tmp cd cu13_tmp $PYTHON -m wheel unpack "$cu13_wheel" cu13_dir=$(find . -maxdepth 1 -type d -name "pynvbench-*" | head -1) -# Copy the CUDA 13 extension into the base wheel +# Copy the CUDA 13 extension AND library into the base wheel cp "$cu13_dir"/cuda/bench/_nvbench_cu13*.so "../$base_dir/cuda/bench/" +cp "$cu13_dir"/cuda/bench/libnvbench.so "../$base_dir/cuda/bench/libnvbench_cu13.so" + +# Update _nvbench_cu13.so to link to libnvbench_cu13.so instead of libnvbench.so +patchelf --replace-needed libnvbench.so libnvbench_cu13.so "../$base_dir"/cuda/bench/_nvbench_cu13*.so # Go back and clean up cd .. 
diff --git a/ci/test_pynvbench_inner.sh b/ci/test_pynvbench_inner.sh index cc0e7527..4a7bd1c1 100755 --- a/ci/test_pynvbench_inner.sh +++ b/ci/test_pynvbench_inner.sh @@ -40,9 +40,10 @@ fi # Determine which CUDA extra to install (defaults to cu12 if not specified) CUDA_EXTRA="${cuda_extra:-cu${cuda_version}}" +TEST_EXTRA="test-cu${cuda_version}" -echo "Installing wheel: $PYNVBENCH_WHEEL_PATH with extras: ${CUDA_EXTRA},test" -python -m pip install "${PYNVBENCH_WHEEL_PATH}[${CUDA_EXTRA},test]" +echo "Installing wheel: $PYNVBENCH_WHEEL_PATH with extras: ${CUDA_EXTRA},${TEST_EXTRA}" +python -m pip install "${PYNVBENCH_WHEEL_PATH}[${CUDA_EXTRA},${TEST_EXTRA}]" # Run tests cd "/workspace/python/test/" diff --git a/python/pyproject.toml b/python/pyproject.toml index f238fa6b..08161210 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -35,7 +35,13 @@ cu12 = ["cuda-bindings>=12.0.0,<13.0.0", "nvidia-cuda-cupti-cu12"] # CUDA 13.x dependencies cu13 = ["cuda-bindings>=13.0.0,<14.0.0", "nvidia-cuda-cupti>=13.0"] -# Test dependencies (using CUDA 12 by default) +# Test dependencies for CUDA 12 +test-cu12 = ["pynvbench[cu12]", "pytest", "cupy-cuda12x", "numba"] + +# Test dependencies for CUDA 13 +test-cu13 = ["pynvbench[cu13]", "pytest", "cupy-cuda13x", "numba"] + +# Generic test dependencies (defaults to CUDA 12) test = ["pytest", "cupy-cuda12x", "numba"] [project.urls] From 1e2ef0fe04fb89be49363c9313cfeaf2aa2c23bc Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 13:17:13 -0500 Subject: [PATCH 09/20] Revert "Reduce CI matrix" This reverts commit edd31c764d32c3608c0a1bc1339fd335181017b8. --- .../build-and-test-python-wheels.yml | 6 +- .github/workflows/build-wheels.yml | 2 +- .github/workflows/pr.yml | 102 +++++++++--------- 3 files changed, 55 insertions(+), 55 deletions(-) diff --git a/.github/workflows/build-and-test-python-wheels.yml b/.github/workflows/build-and-test-python-wheels.yml index f856c650..c58df68c 100644 --- a/.github/workflows/build-and-test-python-wheels.yml +++ b/.github/workflows/build-and-test-python-wheels.yml @@ -19,7 +19,7 @@ jobs: strategy: fail-fast: false matrix: - python: ['3.13'] + python: ['3.10', '3.11', '3.12', '3.13'] steps: - name: Checkout repository @@ -55,8 +55,8 @@ jobs: strategy: fail-fast: false matrix: - cuda: ['13'] - python: ['3.13'] + cuda: ['12', '13'] + python: ['3.10', '3.11', '3.12', '3.13'] steps: - name: Checkout repository diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 993e60fa..4ae97491 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -18,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - python: ['3.13'] + python: ['3.10', '3.11', '3.12', '3.13'] steps: - name: Checkout repository diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 3653f510..c42cda01 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -35,46 +35,46 @@ permissions: pull-requests: read jobs: - # compute-matrix: - # name: Compute matrix - # runs-on: ubuntu-latest - # outputs: - # DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} - # PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} - # PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} - # base_sha: ${{ steps.export-pr-info.outputs.base_sha }} - # pr_number: ${{ steps.export-pr-info.outputs.pr_number }} - # steps: - # - name: Checkout repo - # uses: actions/checkout@v4 - # - name: Lookup PR info 
- # id: get-pr-info - # uses: nv-gha-runners/get-pr-info@main - # - name: Export PR info - # id: export-pr-info - # run: | - # echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" - # echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" - # - name: Compute matrix outputs - # id: set-outputs - # run: | - # .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request + compute-matrix: + name: Compute matrix + runs-on: ubuntu-latest + outputs: + DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} + PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} + PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} + base_sha: ${{ steps.export-pr-info.outputs.base_sha }} + pr_number: ${{ steps.export-pr-info.outputs.pr_number }} + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Lookup PR info + id: get-pr-info + uses: nv-gha-runners/get-pr-info@main + - name: Export PR info + id: export-pr-info + run: | + echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" + echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" + - name: Compute matrix outputs + id: set-outputs + run: | + .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request - # nvbench: - # name: NVBench CUDA${{ matrix.cuda_host_combination }} - # permissions: - # id-token: write - # contents: read - # needs: compute-matrix - # uses: ./.github/workflows/dispatch-build-and-test.yml - # strategy: - # fail-fast: false - # matrix: - # cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} - # with: - # project_name: "nvbench" - # per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} - # devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} + nvbench: + name: NVBench CUDA${{ matrix.cuda_host_combination }} + permissions: + id-token: write + contents: read + needs: compute-matrix + uses: ./.github/workflows/dispatch-build-and-test.yml + strategy: + fail-fast: false + matrix: + cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} + with: + project_name: "nvbench" + per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} + devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} python-wheels: name: Python Wheels @@ -83,16 +83,16 @@ jobs: contents: read uses: ./.github/workflows/build-and-test-python-wheels.yml - # verify-devcontainers: - # name: Verify Dev Containers - # if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} - # needs: compute-matrix - # permissions: - # id-token: write - # contents: read - # uses: ./.github/workflows/verify-devcontainers.yml - # with: - # base_sha: ${{ needs.compute-matrix.outputs.base_sha }} + verify-devcontainers: + name: Verify Dev Containers + if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} + needs: compute-matrix + permissions: + id-token: write + contents: read + uses: ./.github/workflows/verify-devcontainers.yml + with: + base_sha: ${{ needs.compute-matrix.outputs.base_sha }} # This job is the final job that runs after all other jobs and is used for branch protection 
status checks. # See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks @@ -102,9 +102,9 @@ jobs: name: CI if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success needs: - # - nvbench + - nvbench - python-wheels - # - verify-devcontainers + - verify-devcontainers steps: - name: Check status of all precursor jobs if: >- From d54f264f23f350f805d3eed5eefce96e4ec7be6c Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 16:08:27 -0500 Subject: [PATCH 10/20] Try to simplify the approach --- ci/build_multi_cuda_wheel.sh | 31 ++++++++++++++-------------- ci/build_pynvbench_wheel_for_cuda.sh | 15 +------------- python/CMakeLists.txt | 18 ++++++---------- python/cuda/bench/__init__.py | 10 ++++++--- 4 files changed, 29 insertions(+), 45 deletions(-) diff --git a/ci/build_multi_cuda_wheel.sh b/ci/build_multi_cuda_wheel.sh index fe7d4c9e..a91c77fb 100755 --- a/ci/build_multi_cuda_wheel.sh +++ b/ci/build_multi_cuda_wheel.sh @@ -92,9 +92,9 @@ fi # Needed for unpacking and repacking wheels. $PYTHON -m pip install --break-system-packages wheel -# Find the built wheels -cu12_wheel=$(find wheelhouse -name "*cu12*.whl" | head -1) -cu13_wheel=$(find wheelhouse -name "*cu13*.whl" | head -1) +# Find the built wheels (they no longer have cu12/cu13 suffix in the name) +cu12_wheel=$(find wheelhouse -name "pynvbench-*.whl" | head -1) +cu13_wheel=$(find wheelhouse -name "pynvbench-*.whl" | tail -1) if [[ -z "$cu12_wheel" ]]; then echo "Error: CUDA 12 wheel not found in wheelhouse/" @@ -108,6 +108,12 @@ if [[ -z "$cu13_wheel" ]]; then exit 1 fi +if [[ "$cu12_wheel" == "$cu13_wheel" ]]; then + echo "Error: Only one wheel found, expected two (CUDA 12 and CUDA 13)" + ls -la wheelhouse/ + exit 1 +fi + echo "Found CUDA 12 wheel: $cu12_wheel" echo "Found CUDA 13 wheel: $cu13_wheel" @@ -123,36 +129,29 @@ cd wheelhouse_merged $PYTHON -m wheel unpack "$cu12_wheel" base_dir=$(find . -maxdepth 1 -type d -name "pynvbench-*" | head -1) -# Rename libnvbench.so to libnvbench_cu12.so in the base (CUDA 12) wheel -mv "$base_dir/cuda/bench/libnvbench.so" "$base_dir/cuda/bench/libnvbench_cu12.so" - -# Update _nvbench_cu12.so to link to libnvbench_cu12.so instead of libnvbench.so -patchelf --replace-needed libnvbench.so libnvbench_cu12.so "$base_dir"/cuda/bench/_nvbench_cu12*.so - # Unpack CUDA 13 wheel into a temporary subdirectory mkdir cu13_tmp cd cu13_tmp $PYTHON -m wheel unpack "$cu13_wheel" cu13_dir=$(find . -maxdepth 1 -type d -name "pynvbench-*" | head -1) -# Copy the CUDA 13 extension AND library into the base wheel -cp "$cu13_dir"/cuda/bench/_nvbench_cu13*.so "../$base_dir/cuda/bench/" -cp "$cu13_dir"/cuda/bench/libnvbench.so "../$base_dir/cuda/bench/libnvbench_cu13.so" - -# Update _nvbench_cu13.so to link to libnvbench_cu13.so instead of libnvbench.so -patchelf --replace-needed libnvbench.so libnvbench_cu13.so "../$base_dir"/cuda/bench/_nvbench_cu13*.so +# Copy the cu13/ directory from CUDA 13 wheel into the base wheel +cp -r "$cu13_dir"/cuda/bench/cu13 "../$base_dir/cuda/bench/" # Go back and clean up cd .. rm -rf cu13_tmp +# Remove RECORD file to let wheel recreate it +rm -f "$base_dir"/*.dist-info/RECORD + # Repack the merged wheel $PYTHON -m wheel pack "$base_dir" cd .. 
# Install auditwheel and repair the merged wheel -$PYTHON -m pip install --break-system-packages patchelf auditwheel +$PYTHON -m pip install --break-system-packages auditwheel for wheel in wheelhouse_merged/pynvbench-*.whl; do echo "Repairing merged wheel: $wheel" $PYTHON -m auditwheel repair \ diff --git a/ci/build_pynvbench_wheel_for_cuda.sh b/ci/build_pynvbench_wheel_for_cuda.sh index 0e3d4b7d..79ebbd32 100755 --- a/ci/build_pynvbench_wheel_for_cuda.sh +++ b/ci/build_pynvbench_wheel_for_cuda.sh @@ -56,22 +56,9 @@ export CXX="$(which g++)" export CUDACXX="$(which nvcc)" export CUDAHOSTCXX="$(which g++)" -# Set CUDA suffix for extension naming -export PYNVBENCH_CUDA_SUFFIX="_cu${cuda_version}" - # Build the wheel python -m pip wheel --no-deps --verbose --wheel-dir dist . -# Rename wheel to include CUDA version suffix -for wheel in dist/pynvbench-*.whl; do - if [[ -f "$wheel" ]]; then - base_name=$(basename "$wheel" .whl) - new_name="${base_name}.cu${cuda_version}.whl" - mv "$wheel" "dist/${new_name}" - echo "Renamed wheel to: ${new_name}" - fi -done - # Move wheel to output directory mkdir -p /workspace/wheelhouse -mv dist/pynvbench-*.cu*.whl /workspace/wheelhouse/ +mv dist/pynvbench-*.whl /workspace/wheelhouse/ diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 20cd71cc..e7c0c776 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -25,18 +25,12 @@ CPMAddPackage( CPMAddPackage("gh:pybind/pybind11@3.0.0") -# Determine CUDA major version for extension naming +# Determine CUDA major version for extension naming and directory structure string(REGEX MATCH "^([0-9]+)" CUDA_VERSION_MAJOR "${CUDAToolkit_VERSION}") +set(CUDA_VERSION_DIR "cu${CUDA_VERSION_MAJOR}") -# Allow override of extension suffix via environment variable (for multi-CUDA builds) -if(DEFINED ENV{PYNVBENCH_CUDA_SUFFIX}) - set(NVBENCH_EXTENSION_SUFFIX $ENV{PYNVBENCH_CUDA_SUFFIX}) -else() - set(NVBENCH_EXTENSION_SUFFIX "_cu${CUDA_VERSION_MAJOR}") -endif() - -set(NVBENCH_EXTENSION_NAME "_nvbench${NVBENCH_EXTENSION_SUFFIX}") -message(STATUS "Building extension: ${NVBENCH_EXTENSION_NAME}") +set(NVBENCH_EXTENSION_NAME "_nvbench_cu${CUDA_VERSION_MAJOR}") +message(STATUS "Building extension: ${NVBENCH_EXTENSION_NAME} for CUDA ${CUDA_VERSION_MAJOR}, output directory: ${CUDA_VERSION_DIR}") pybind11_add_module(${NVBENCH_EXTENSION_NAME} MODULE src/py_nvbench.cpp) target_compile_definitions(${NVBENCH_EXTENSION_NAME} PRIVATE PYBIND11_MODULE_NAME=${NVBENCH_EXTENSION_NAME}) @@ -47,9 +41,9 @@ set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INSTALL_RPATH "$ORIGI set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS ${NVBENCH_EXTENSION_NAME} DESTINATION cuda/bench) +install(TARGETS ${NVBENCH_EXTENSION_NAME} DESTINATION cuda/bench/${CUDA_VERSION_DIR}) # Determine target that nvbench::nvbench is an alias of, # necessary because ALIAS targets cannot be installed get_target_property(_aliased_target_name nvbench::nvbench ALIASED_TARGET) -install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/bench) +install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/bench/${CUDA_VERSION_DIR}) diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py index 9c8a35ce..0e8ce77d 100644 --- a/python/cuda/bench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -52,10 +52,11 @@ def _get_cuda_major_version(): _cuda_major = 
_get_cuda_major_version() -_extension_name = f"_nvbench_cu{_cuda_major}" +_extra_name = f"cu{_cuda_major}" +_module_fullname = f"cuda.bench.{_extra_name}._nvbench_cu{_cuda_major}" try: - _nvbench_module = importlib.import_module(f"cuda.bench.{_extension_name}") + _nvbench_module = importlib.import_module(_module_fullname) except ImportError as e: raise ImportError( f"No pynvbench extension found for CUDA {_cuda_major}.x. " @@ -76,6 +77,8 @@ def _get_cuda_major_version(): State = _nvbench_module.State register = _nvbench_module.register run_all_benchmarks = _nvbench_module.run_all_benchmarks +test_cpp_exception = _nvbench_module.test_cpp_exception +test_py_exception = _nvbench_module.test_py_exception # Expose the module as _nvbench for backward compatibility (e.g., for tests) _nvbench = _nvbench_module @@ -85,6 +88,7 @@ def _get_cuda_major_version(): load_nvidia_dynamic_lib, _nvbench_module, _cuda_major, - _extension_name, + _extra_name, + _module_fullname, _get_cuda_major_version, ) From b63f44aba69861af8834797dc885029fb71a9327 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 16:40:44 -0500 Subject: [PATCH 11/20] More simplifications --- .../build-and-test-python-wheels.yml | 1 - .github/workflows/build-wheels.yml | 57 ------------ ci/build_multi_cuda_wheel.sh | 12 ++- ci/build_pynvbench_wheel.sh | 90 ------------------- ci/build_pynvbench_wheel_for_cuda.sh | 27 +++--- ci/build_pynvbench_wheel_inner.sh | 89 ------------------ python/CMakeLists.txt | 7 +- python/cuda/bench/__init__.py | 2 +- 8 files changed, 31 insertions(+), 254 deletions(-) delete mode 100644 .github/workflows/build-wheels.yml delete mode 100755 ci/build_pynvbench_wheel.sh delete mode 100755 ci/build_pynvbench_wheel_inner.sh diff --git a/.github/workflows/build-and-test-python-wheels.yml b/.github/workflows/build-and-test-python-wheels.yml index c58df68c..46f391bc 100644 --- a/.github/workflows/build-and-test-python-wheels.yml +++ b/.github/workflows/build-and-test-python-wheels.yml @@ -9,7 +9,6 @@ defaults: shell: bash --noprofile --norc -euo pipefail {0} jobs: - # Build multi-CUDA wheels (one wheel per Python version, includes both CUDA 12 & 13) build-wheels: name: Build wheel (Python ${{ matrix.python }}) runs-on: ubuntu-latest diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml deleted file mode 100644 index 4ae97491..00000000 --- a/.github/workflows/build-wheels.yml +++ /dev/null @@ -1,57 +0,0 @@ -name: Build Python Wheels (Manual) - -on: - workflow_dispatch: - workflow_call: - -defaults: - run: - shell: bash --noprofile --norc -euo pipefail {0} - -jobs: - build-wheels: - name: Build wheel (Python ${{ matrix.python }}) - runs-on: ubuntu-latest - permissions: - id-token: write - contents: read - strategy: - fail-fast: false - matrix: - python: ['3.10', '3.11', '3.12', '3.13'] - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - persist-credentials: false - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Build multi-CUDA wheel - run: | - bash ci/build_multi_cuda_wheel.sh -py-version ${{ matrix.python }} - - - name: Upload wheel artifact - uses: actions/upload-artifact@v4 - with: - name: wheel-pynvbench-py${{ matrix.python }} - path: wheelhouse/*.whl - retention-days: 7 - if-no-files-found: error - - verify-wheels: - name: Verify all wheels built successfully - if: ${{ always() }} - needs: build-wheels - runs-on: ubuntu-latest - steps: - - name: Check build results - run: | - if [[ "${{ 
needs.build-wheels.result }}" != "success" ]]; then - echo "Wheel builds failed!" - exit 1 - fi - echo "All wheels built successfully!" diff --git a/ci/build_multi_cuda_wheel.sh b/ci/build_multi_cuda_wheel.sh index a91c77fb..a25cf39f 100755 --- a/ci/build_multi_cuda_wheel.sh +++ b/ci/build_multi_cuda_wheel.sh @@ -16,6 +16,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Build a multi-CUDA wheel for the given Python version +# This builds separate wheels for each supported CUDA major version, +# and then merges them into a single wheel containing extensions +# for all CUDA versions. At runtime, depending on the installed CUDA version, +# the correct extension will be chosen. + set -euo pipefail ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -92,9 +98,9 @@ fi # Needed for unpacking and repacking wheels. $PYTHON -m pip install --break-system-packages wheel -# Find the built wheels (they no longer have cu12/cu13 suffix in the name) -cu12_wheel=$(find wheelhouse -name "pynvbench-*.whl" | head -1) -cu13_wheel=$(find wheelhouse -name "pynvbench-*.whl" | tail -1) +# Find the built wheels (temporarily suffixed with .cu12/.cu13 to avoid collision) +cu12_wheel=$(find wheelhouse -name "*cu12*.whl" | head -1) +cu13_wheel=$(find wheelhouse -name "*cu13*.whl" | head -1) if [[ -z "$cu12_wheel" ]]; then echo "Error: CUDA 12 wheel not found in wheelhouse/" diff --git a/ci/build_pynvbench_wheel.sh b/ci/build_pynvbench_wheel.sh deleted file mode 100755 index 306891be..00000000 --- a/ci/build_pynvbench_wheel.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash -set -euo pipefail - -ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -usage="Usage: $0 -py-version -cuda-version " - -source "$ci_dir/util/python/common_arg_parser.sh" - -# Parse arguments including CUDA version -parse_python_args "$@" - -# Now parse CUDA version from remaining arguments -cuda_version="" -while [[ $# -gt 0 ]]; do - case $1 in - -cuda-version=*) - cuda_version="${1#*=}" - shift - ;; - -cuda-version) - if [[ $# -lt 2 ]]; then - echo "Error: -cuda-version requires a value" >&2 - exit 1 - fi - cuda_version="$2" - shift 2 - ;; - *) - shift - ;; - esac -done - -# Check if py_version was provided (this script requires it) -require_py_version "$usage" || exit 1 - -if [[ -z "$cuda_version" ]]; then - echo "Error: -cuda-version is required" - echo "$usage" - exit 1 -fi - -echo "Docker socket: " $(ls /var/run/docker.sock) - -# Map cuda_version to full version -if [[ "$cuda_version" == "12" ]]; then - cuda_full_version="12.9.1" - cuda_short="cu12" -elif [[ "$cuda_version" == "13" ]]; then - cuda_full_version="13.0.1" - cuda_short="cu13" -else - echo "Error: Unsupported CUDA version: $cuda_version" - exit 1 -fi - -# pynvbench must be built in a container that can produce manylinux wheels, -# and has the CUDA toolkit installed. We use the rapidsai/ci-wheel image for this. 
-readonly devcontainer_version=25.12 -readonly devcontainer_distro=rockylinux8 - -if [[ "$(uname -m)" == "aarch64" ]]; then - readonly cuda_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda_full_version}-${devcontainer_distro}-py${py_version}-arm64 -else - readonly cuda_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda_full_version}-${devcontainer_distro}-py${py_version} -fi - -mkdir -p wheelhouse - -echo "::group::⚒️ Building CUDA ${cuda_version} wheel on ${cuda_image}" -( - set -x - docker pull $cuda_image - docker run --rm -i \ - --workdir /workspace/python \ - --mount type=bind,source=$(pwd),target=/workspace/ \ - --env py_version=${py_version} \ - --env cuda_version=${cuda_version} \ - $cuda_image \ - /workspace/ci/build_pynvbench_wheel_inner.sh - # Prevent GHA runners from exhausting available storage with leftover images: - if [[ -n "${GITHUB_ACTIONS:-}" ]]; then - docker rmi -f $cuda_image - fi -) -echo "::endgroup::" - -echo "Wheels in wheelhouse:" -ls -la wheelhouse/ diff --git a/ci/build_pynvbench_wheel_for_cuda.sh b/ci/build_pynvbench_wheel_for_cuda.sh index 79ebbd32..c0289ca2 100755 --- a/ci/build_pynvbench_wheel_for_cuda.sh +++ b/ci/build_pynvbench_wheel_for_cuda.sh @@ -22,16 +22,16 @@ set -euxo pipefail # This script builds a single wheel for the container's CUDA version # The /workspace pathnames are hard-wired here. +# Determine CUDA version from nvcc early (needed for dev package installation) +cuda_version=$(nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+' | cut -d. -f1) +echo "Detected CUDA version: ${cuda_version}" + # Install GCC 13 toolset (needed for the build) /workspace/ci/util/retry.sh 5 30 dnf -y install gcc-toolset-13-gcc gcc-toolset-13-gcc-c++ echo -e "#!/bin/bash\nsource /opt/rh/gcc-toolset-13/enable" >/etc/profile.d/enable_devtools.sh source /etc/profile.d/enable_devtools.sh -# Check what's available -which gcc -gcc --version -which nvcc -nvcc --version +# Note: CUDA dev packages (NVML, CUPTI, CUDART) are already installed in rapidsai/ci-wheel containers # Set up Python environment source /workspace/ci/pyenv_helper.sh @@ -47,10 +47,6 @@ fi cd /workspace/python -# Determine CUDA version from nvcc -cuda_version=$(nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+' | cut -d. -f1) -echo "Detected CUDA version: ${cuda_version}" - # Configure compilers export CXX="$(which g++)" export CUDACXX="$(which nvcc)" @@ -59,6 +55,17 @@ export CUDAHOSTCXX="$(which g++)" # Build the wheel python -m pip wheel --no-deps --verbose --wheel-dir dist . +# Temporarily rename wheel to include CUDA version to avoid collision during multi-CUDA build +# The merge script will combine these into a single wheel +for wheel in dist/pynvbench-*.whl; do + if [[ -f "$wheel" ]]; then + base_name=$(basename "$wheel" .whl) + new_name="${base_name}.cu${cuda_version}.whl" + mv "$wheel" "dist/${new_name}" + echo "Renamed wheel to: ${new_name}" + fi +done + # Move wheel to output directory mkdir -p /workspace/wheelhouse -mv dist/pynvbench-*.whl /workspace/wheelhouse/ +mv dist/pynvbench-*.cu*.whl /workspace/wheelhouse/ diff --git a/ci/build_pynvbench_wheel_inner.sh b/ci/build_pynvbench_wheel_inner.sh deleted file mode 100755 index 543deb6a..00000000 --- a/ci/build_pynvbench_wheel_inner.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# Target script for `docker run` command in build_pynvbench_wheel.sh -# The /workspace pathnames are hard-wired here. 
- -# Install GCC 13 toolset (needed for the build) -/workspace/ci/util/retry.sh 5 30 dnf -y install gcc-toolset-13-gcc gcc-toolset-13-gcc-c++ -echo -e "#!/bin/bash\nsource /opt/rh/gcc-toolset-13/enable" >/etc/profile.d/enable_devtools.sh -source /etc/profile.d/enable_devtools.sh - -# Check what's available -which gcc -gcc --version -which nvcc -nvcc --version - -# Set up Python environment -source /workspace/ci/pyenv_helper.sh -setup_python_env "${py_version}" -which python -python --version -echo "Done setting up python env" - -# Ensure we have full git history for setuptools_scm -if $(git rev-parse --is-shallow-repository); then - git fetch --unshallow -fi - -cd /workspace/python - -# Determine CUDA version from nvcc -cuda_major=$(nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+' | cut -d. -f1) -echo "Detected CUDA major version: ${cuda_major}" - -# Configure compilers: -export CXX="$(which g++)" -export CUDACXX="$(which nvcc)" -export CUDAHOSTCXX="$(which g++)" - -# Build the wheel -python -m pip wheel --no-deps --verbose --wheel-dir dist . - -# Install auditwheel for manylinux compliance -python -m pip install auditwheel - -# Repair wheel to make it manylinux compliant -mkdir -p dist_repaired -for wheel in dist/pynvbench-*.whl; do - if [[ -f "$wheel" ]]; then - echo "Repairing wheel: $wheel" - python -m auditwheel repair \ - --exclude 'libcuda.so.1' \ - --exclude 'libnvidia-ml.so.1' \ - "$wheel" \ - --wheel-dir dist_repaired - fi -done - -# Rename wheel to include CUDA version suffix -mkdir -p /workspace/wheelhouse -for wheel in dist_repaired/pynvbench-*.whl; do - if [[ -f "$wheel" ]]; then - base_name=$(basename "$wheel" .whl) - # Append CUDA version to the local version identifier - # e.g., pynvbench-0.1.0.dev1+gabc123-cp312-cp312-manylinux_2_28_x86_64.whl - # becomes pynvbench-0.1.0.dev1+gabc123.cu12-cp312-cp312-manylinux_2_28_x86_64.whl - if [[ "$base_name" =~ ^(.*)-cp([0-9]+)-cp([0-9]+)-(.*) ]]; then - pkg_version="${BASH_REMATCH[1]}" - py_tag="cp${BASH_REMATCH[2]}" - abi_tag="cp${BASH_REMATCH[3]}" - platform="${BASH_REMATCH[4]}" - # If version has a local part (contains +), append .cu${cuda_major} to it - # Otherwise add +cu${cuda_major} - if [[ "$pkg_version" =~ \+ ]]; then - new_version="${pkg_version}.cu${cuda_major}" - else - new_version="${pkg_version}+cu${cuda_major}" - fi - new_name="${new_version}-${py_tag}-${abi_tag}-${platform}.whl" - mv "$wheel" "/workspace/wheelhouse/${new_name}" - echo "Renamed wheel to: ${new_name}" - else - # Fallback if regex doesn't match - mv "$wheel" /workspace/wheelhouse/ - echo "Moved wheel: $(basename $wheel)" - fi - fi -done diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index e7c0c776..f6ced68a 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -25,12 +25,13 @@ CPMAddPackage( CPMAddPackage("gh:pybind/pybind11@3.0.0") -# Determine CUDA major version for extension naming and directory structure +# Determine CUDA major version for directory structure string(REGEX MATCH "^([0-9]+)" CUDA_VERSION_MAJOR "${CUDAToolkit_VERSION}") set(CUDA_VERSION_DIR "cu${CUDA_VERSION_MAJOR}") +message(STATUS "Building extension for CUDA ${CUDA_VERSION_MAJOR}, output directory: cuda/bench/${CUDA_VERSION_DIR}") -set(NVBENCH_EXTENSION_NAME "_nvbench_cu${CUDA_VERSION_MAJOR}") -message(STATUS "Building extension: ${NVBENCH_EXTENSION_NAME} for CUDA ${CUDA_VERSION_MAJOR}, output directory: ${CUDA_VERSION_DIR}") +# Extension name is always _nvbench (directory provides CUDA version separation) +set(NVBENCH_EXTENSION_NAME "_nvbench") 
pybind11_add_module(${NVBENCH_EXTENSION_NAME} MODULE src/py_nvbench.cpp) target_compile_definitions(${NVBENCH_EXTENSION_NAME} PRIVATE PYBIND11_MODULE_NAME=${NVBENCH_EXTENSION_NAME}) diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py index 0e8ce77d..a47bb394 100644 --- a/python/cuda/bench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -53,7 +53,7 @@ def _get_cuda_major_version(): _cuda_major = _get_cuda_major_version() _extra_name = f"cu{_cuda_major}" -_module_fullname = f"cuda.bench.{_extra_name}._nvbench_cu{_cuda_major}" +_module_fullname = f"cuda.bench.{_extra_name}._nvbench" try: _nvbench_module = importlib.import_module(_module_fullname) From 35f5272eeee9a03ca6496004ab5f484c65645875 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 16:49:09 -0500 Subject: [PATCH 12/20] Some fixes --- python/CMakeLists.txt | 19 ++++++++----------- python/pyproject.toml | 2 +- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index f6ced68a..1390caea 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -30,19 +30,16 @@ string(REGEX MATCH "^([0-9]+)" CUDA_VERSION_MAJOR "${CUDAToolkit_VERSION}") set(CUDA_VERSION_DIR "cu${CUDA_VERSION_MAJOR}") message(STATUS "Building extension for CUDA ${CUDA_VERSION_MAJOR}, output directory: cuda/bench/${CUDA_VERSION_DIR}") -# Extension name is always _nvbench (directory provides CUDA version separation) -set(NVBENCH_EXTENSION_NAME "_nvbench") +pybind11_add_module(_nvbench MODULE src/py_nvbench.cpp) +target_compile_definitions(_nvbench PRIVATE PYBIND11_MODULE_NAME=_nvbench) +target_link_libraries(_nvbench PUBLIC nvbench::nvbench) +target_link_libraries(_nvbench PRIVATE CUDA::cudart_static CUDA::cuda_driver) -pybind11_add_module(${NVBENCH_EXTENSION_NAME} MODULE src/py_nvbench.cpp) -target_compile_definitions(${NVBENCH_EXTENSION_NAME} PRIVATE PYBIND11_MODULE_NAME=${NVBENCH_EXTENSION_NAME}) -target_link_libraries(${NVBENCH_EXTENSION_NAME} PUBLIC nvbench::nvbench) -target_link_libraries(${NVBENCH_EXTENSION_NAME} PRIVATE CUDA::cudart_static CUDA::cuda_driver) +set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN") +set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) +set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON) -set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INSTALL_RPATH "$ORIGIN") -set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) -set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) - -install(TARGETS ${NVBENCH_EXTENSION_NAME} DESTINATION cuda/bench/${CUDA_VERSION_DIR}) +install(TARGETS _nvbench DESTINATION cuda/bench/${CUDA_VERSION_DIR}) # Determine target that nvbench::nvbench is an alias of, # necessary because ALIAS targets cannot be installed diff --git a/python/pyproject.toml b/python/pyproject.toml index 08161210..e752eca2 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -55,7 +55,7 @@ build-dir = "build/{wheel_tag}" [tool.scikit-build.cmake] version = ">=3.30.4" -args = ["-DNVBench_ENABLE_NVML=OFF", "-DNVBench_ENABLE_CUPTI=OFF"] +args = [] build-type = "Release" source-dir = "." 
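
For context on the layout that patches 11 and 12 converge on: each wheel ships the extension twice, as cuda/bench/cu12/_nvbench and cuda/bench/cu13/_nvbench, and cuda/bench/__init__.py imports exactly one of them based on the CUDA major version detected at runtime. The Python sketch below is a minimal, self-contained illustration of that dispatch pattern only; the load_nvbench_extension name and the explicit cuda_major argument are placeholders, since the package's real _get_cuda_major_version helper is not reproduced in these diffs.

    import importlib

    def load_nvbench_extension(cuda_major: int):
        """Import the _nvbench extension built for the given CUDA major version.

        The installed package derives cuda_major from the system's CUDA
        driver/runtime via its own helper; here it is passed in explicitly
        to keep the sketch self-contained.
        """
        module_fullname = f"cuda.bench.cu{cuda_major}._nvbench"
        try:
            return importlib.import_module(module_fullname)
        except ImportError as exc:
            raise ImportError(
                f"No pynvbench extension found for CUDA {cuda_major}.x; "
                f"expected module {module_fullname!r} inside the wheel."
            ) from exc

    # Example usage on a machine with a CUDA 13 driver, after installing the
    # built wheel with its CUDA 13 extra, mirroring the test script's
    # pip install "<path-to-wheel>[cu13]" pattern:
    # _nvbench = load_nvbench_extension(13)
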
From 933b951aea71b86654e1d190a3d79d2f4e16d496 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 16:50:59 -0500 Subject: [PATCH 13/20] We can just use TEST_EXTRA --- ci/test_pynvbench_inner.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/test_pynvbench_inner.sh b/ci/test_pynvbench_inner.sh index 4a7bd1c1..4303a5dd 100755 --- a/ci/test_pynvbench_inner.sh +++ b/ci/test_pynvbench_inner.sh @@ -42,8 +42,8 @@ fi CUDA_EXTRA="${cuda_extra:-cu${cuda_version}}" TEST_EXTRA="test-cu${cuda_version}" -echo "Installing wheel: $PYNVBENCH_WHEEL_PATH with extras: ${CUDA_EXTRA},${TEST_EXTRA}" -python -m pip install "${PYNVBENCH_WHEEL_PATH}[${CUDA_EXTRA},${TEST_EXTRA}]" +echo "Installing wheel: $PYNVBENCH_WHEEL_PATH with extras: ${TEST_EXTRA}" +python -m pip install "${PYNVBENCH_WHEEL_PATH}[${TEST_EXTRA}]" # Run tests cd "/workspace/python/test/" From 13cb606165d7beb364f243c0d41c72f79d534c8e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 16:52:41 -0500 Subject: [PATCH 14/20] Reduce CI matrix --- .../build-and-test-python-wheels.yml | 6 +- .github/workflows/pr.yml | 102 +++++++++--------- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/.github/workflows/build-and-test-python-wheels.yml b/.github/workflows/build-and-test-python-wheels.yml index 46f391bc..8ca1a179 100644 --- a/.github/workflows/build-and-test-python-wheels.yml +++ b/.github/workflows/build-and-test-python-wheels.yml @@ -18,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - python: ['3.10', '3.11', '3.12', '3.13'] + python: ['3.13'] steps: - name: Checkout repository @@ -54,8 +54,8 @@ jobs: strategy: fail-fast: false matrix: - cuda: ['12', '13'] - python: ['3.10', '3.11', '3.12', '3.13'] + cuda: ['13'] + python: ['3.13'] steps: - name: Checkout repository diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index c42cda01..3653f510 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -35,46 +35,46 @@ permissions: pull-requests: read jobs: - compute-matrix: - name: Compute matrix - runs-on: ubuntu-latest - outputs: - DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} - PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} - PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} - base_sha: ${{ steps.export-pr-info.outputs.base_sha }} - pr_number: ${{ steps.export-pr-info.outputs.pr_number }} - steps: - - name: Checkout repo - uses: actions/checkout@v4 - - name: Lookup PR info - id: get-pr-info - uses: nv-gha-runners/get-pr-info@main - - name: Export PR info - id: export-pr-info - run: | - echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" - echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" - - name: Compute matrix outputs - id: set-outputs - run: | - .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request + # compute-matrix: + # name: Compute matrix + # runs-on: ubuntu-latest + # outputs: + # DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} + # PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} + # PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} + # base_sha: ${{ steps.export-pr-info.outputs.base_sha }} + # pr_number: ${{ steps.export-pr-info.outputs.pr_number }} + # steps: + # - name: Checkout repo + # uses: actions/checkout@v4 + # - name: Lookup PR 
info + # id: get-pr-info + # uses: nv-gha-runners/get-pr-info@main + # - name: Export PR info + # id: export-pr-info + # run: | + # echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" + # echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" + # - name: Compute matrix outputs + # id: set-outputs + # run: | + # .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request - nvbench: - name: NVBench CUDA${{ matrix.cuda_host_combination }} - permissions: - id-token: write - contents: read - needs: compute-matrix - uses: ./.github/workflows/dispatch-build-and-test.yml - strategy: - fail-fast: false - matrix: - cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} - with: - project_name: "nvbench" - per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} - devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} + # nvbench: + # name: NVBench CUDA${{ matrix.cuda_host_combination }} + # permissions: + # id-token: write + # contents: read + # needs: compute-matrix + # uses: ./.github/workflows/dispatch-build-and-test.yml + # strategy: + # fail-fast: false + # matrix: + # cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} + # with: + # project_name: "nvbench" + # per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} + # devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} python-wheels: name: Python Wheels @@ -83,16 +83,16 @@ jobs: contents: read uses: ./.github/workflows/build-and-test-python-wheels.yml - verify-devcontainers: - name: Verify Dev Containers - if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} - needs: compute-matrix - permissions: - id-token: write - contents: read - uses: ./.github/workflows/verify-devcontainers.yml - with: - base_sha: ${{ needs.compute-matrix.outputs.base_sha }} + # verify-devcontainers: + # name: Verify Dev Containers + # if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} + # needs: compute-matrix + # permissions: + # id-token: write + # contents: read + # uses: ./.github/workflows/verify-devcontainers.yml + # with: + # base_sha: ${{ needs.compute-matrix.outputs.base_sha }} # This job is the final job that runs after all other jobs and is used for branch protection status checks. 
# See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks @@ -102,9 +102,9 @@ jobs: name: CI if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success needs: - - nvbench + # - nvbench - python-wheels - - verify-devcontainers + # - verify-devcontainers steps: - name: Check status of all precursor jobs if: >- From 9af979d3f305ceb2f616e29f401ea1dd982d412a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 17:06:20 -0500 Subject: [PATCH 15/20] Add exclusions --- ci/build_multi_cuda_wheel.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/build_multi_cuda_wheel.sh b/ci/build_multi_cuda_wheel.sh index a25cf39f..29f7ac41 100755 --- a/ci/build_multi_cuda_wheel.sh +++ b/ci/build_multi_cuda_wheel.sh @@ -163,6 +163,10 @@ for wheel in wheelhouse_merged/pynvbench-*.whl; do $PYTHON -m auditwheel repair \ --exclude 'libcuda.so.1' \ --exclude 'libnvidia-ml.so.1' \ + --exclude 'libcupti.so.12' \ + --exclude 'libcupti.so.13' \ + --exclude 'libnvperf_host.so' \ + --exclude 'libnvperf_target.so' \ "$wheel" \ --wheel-dir wheelhouse_final done From ec3615b1a5f17f1220b1f03311b4aaf92e779c5e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 18:12:28 -0500 Subject: [PATCH 16/20] Revert "Reduce CI matrix" This reverts commit 13cb606165d7beb364f243c0d41c72f79d534c8e. --- .../build-and-test-python-wheels.yml | 6 +- .github/workflows/pr.yml | 102 +++++++++--------- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/.github/workflows/build-and-test-python-wheels.yml b/.github/workflows/build-and-test-python-wheels.yml index 8ca1a179..46f391bc 100644 --- a/.github/workflows/build-and-test-python-wheels.yml +++ b/.github/workflows/build-and-test-python-wheels.yml @@ -18,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - python: ['3.13'] + python: ['3.10', '3.11', '3.12', '3.13'] steps: - name: Checkout repository @@ -54,8 +54,8 @@ jobs: strategy: fail-fast: false matrix: - cuda: ['13'] - python: ['3.13'] + cuda: ['12', '13'] + python: ['3.10', '3.11', '3.12', '3.13'] steps: - name: Checkout repository diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 3653f510..c42cda01 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -35,46 +35,46 @@ permissions: pull-requests: read jobs: - # compute-matrix: - # name: Compute matrix - # runs-on: ubuntu-latest - # outputs: - # DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} - # PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} - # PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} - # base_sha: ${{ steps.export-pr-info.outputs.base_sha }} - # pr_number: ${{ steps.export-pr-info.outputs.pr_number }} - # steps: - # - name: Checkout repo - # uses: actions/checkout@v4 - # - name: Lookup PR info - # id: get-pr-info - # uses: nv-gha-runners/get-pr-info@main - # - name: Export PR info - # id: export-pr-info - # run: | - # echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" - # echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" - # - name: Compute matrix outputs - # id: set-outputs - # run: | - # .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request + compute-matrix: + name: Compute matrix + runs-on: ubuntu-latest + outputs: + 
DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} + PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} + PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} + base_sha: ${{ steps.export-pr-info.outputs.base_sha }} + pr_number: ${{ steps.export-pr-info.outputs.pr_number }} + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Lookup PR info + id: get-pr-info + uses: nv-gha-runners/get-pr-info@main + - name: Export PR info + id: export-pr-info + run: | + echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" + echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" + - name: Compute matrix outputs + id: set-outputs + run: | + .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request - # nvbench: - # name: NVBench CUDA${{ matrix.cuda_host_combination }} - # permissions: - # id-token: write - # contents: read - # needs: compute-matrix - # uses: ./.github/workflows/dispatch-build-and-test.yml - # strategy: - # fail-fast: false - # matrix: - # cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} - # with: - # project_name: "nvbench" - # per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} - # devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} + nvbench: + name: NVBench CUDA${{ matrix.cuda_host_combination }} + permissions: + id-token: write + contents: read + needs: compute-matrix + uses: ./.github/workflows/dispatch-build-and-test.yml + strategy: + fail-fast: false + matrix: + cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} + with: + project_name: "nvbench" + per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} + devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} python-wheels: name: Python Wheels @@ -83,16 +83,16 @@ jobs: contents: read uses: ./.github/workflows/build-and-test-python-wheels.yml - # verify-devcontainers: - # name: Verify Dev Containers - # if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} - # needs: compute-matrix - # permissions: - # id-token: write - # contents: read - # uses: ./.github/workflows/verify-devcontainers.yml - # with: - # base_sha: ${{ needs.compute-matrix.outputs.base_sha }} + verify-devcontainers: + name: Verify Dev Containers + if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} + needs: compute-matrix + permissions: + id-token: write + contents: read + uses: ./.github/workflows/verify-devcontainers.yml + with: + base_sha: ${{ needs.compute-matrix.outputs.base_sha }} # This job is the final job that runs after all other jobs and is used for branch protection status checks. 
# See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks @@ -102,9 +102,9 @@ jobs: name: CI if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success needs: - # - nvbench + - nvbench - python-wheels - # - verify-devcontainers + - verify-devcontainers steps: - name: Check status of all precursor jobs if: >- From d85f6ba6d59be7d99aa88d5dc42411ab8d68fe71 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Tue, 27 Jan 2026 22:25:45 +0000 Subject: [PATCH 17/20] Use appropriate cmake variable to get CUDA major version --- python/CMakeLists.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 36c474e5..7f8548c4 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -26,9 +26,8 @@ CPMAddPackage( CPMAddPackage("gh:pybind/pybind11@3.0.1") # Determine CUDA major version for directory structure -string(REGEX MATCH "^([0-9]+)" CUDA_VERSION_MAJOR "${CUDAToolkit_VERSION}") -set(CUDA_VERSION_DIR "cu${CUDA_VERSION_MAJOR}") -message(STATUS "Building extension for CUDA ${CUDA_VERSION_MAJOR}, output directory: cuda/bench/${CUDA_VERSION_DIR}") +set(CUDA_VERSION_DIR "cu${CUDAToolkit_VERSION_MAJOR}") +message(STATUS "Building extension for CUDA ${CUDAToolkit_VERSION_MAJOR}, output directory: cuda/bench/${CUDA_VERSION_DIR}") add_library(_nvbench MODULE src/py_nvbench.cpp) target_include_directories(_nvbench PRIVATE ${Python_INCLUDE_DIRS}) From b06d87c88fcfda86af9c18ee06d73e933822425b Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Tue, 27 Jan 2026 22:27:29 +0000 Subject: [PATCH 18/20] Reuse cuda image name to reduce redundancy --- ci/build_multi_cuda_wheel.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ci/build_multi_cuda_wheel.sh b/ci/build_multi_cuda_wheel.sh index 29f7ac41..058dd311 100755 --- a/ci/build_multi_cuda_wheel.sh +++ b/ci/build_multi_cuda_wheel.sh @@ -54,13 +54,14 @@ readonly devcontainer_version=25.12 readonly devcontainer_distro=rockylinux8 if [[ "$(uname -m)" == "aarch64" ]]; then - readonly cuda12_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda12_version}-${devcontainer_distro}-py${py_version}-arm64 - readonly cuda13_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda13_version}-${devcontainer_distro}-py${py_version}-arm64 + readonly host_arch_suffix="-arm64" else - readonly cuda12_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda12_version}-${devcontainer_distro}-py${py_version} - readonly cuda13_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda13_version}-${devcontainer_distro}-py${py_version} + readonly host_arch_suffix="" fi +readonly cuda12_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda12_version}-${devcontainer_distro}-py${py_version}${host_arch_suffix} +readonly cuda13_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda13_version}-${devcontainer_distro}-py${py_version}${host_arch_suffix} + mkdir -p wheelhouse for ctk in 12 13; do From 48195f7be3e6f3a76c8b987f7969fdd918820f3c Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Tue, 27 Jan 2026 22:29:11 +0000 Subject: [PATCH 19/20] Remove unused cuda-bindings dep --- python/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index e752eca2..e4b2ca4c 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -30,10 +30,10 @@ readme = { file = 
"README.md", content-type = "text/markdown" } [project.optional-dependencies] # CUDA 12.x dependencies -cu12 = ["cuda-bindings>=12.0.0,<13.0.0", "nvidia-cuda-cupti-cu12"] +cu12 = ["nvidia-cuda-cupti-cu12"] # CUDA 13.x dependencies -cu13 = ["cuda-bindings>=13.0.0,<14.0.0", "nvidia-cuda-cupti>=13.0"] +cu13 = ["nvidia-cuda-cupti>=13.0"] # Test dependencies for CUDA 12 test-cu12 = ["pynvbench[cu12]", "pytest", "cupy-cuda12x", "numba"] From 50997476b7d76ebc9e8b39a562511a3888aaa177 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Tue, 27 Jan 2026 23:18:51 +0000 Subject: [PATCH 20/20] Add cuda-bindings back to deps --- python/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index e4b2ca4c..e752eca2 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -30,10 +30,10 @@ readme = { file = "README.md", content-type = "text/markdown" } [project.optional-dependencies] # CUDA 12.x dependencies -cu12 = ["nvidia-cuda-cupti-cu12"] +cu12 = ["cuda-bindings>=12.0.0,<13.0.0", "nvidia-cuda-cupti-cu12"] # CUDA 13.x dependencies -cu13 = ["nvidia-cuda-cupti>=13.0"] +cu13 = ["cuda-bindings>=13.0.0,<14.0.0", "nvidia-cuda-cupti>=13.0"] # Test dependencies for CUDA 12 test-cu12 = ["pynvbench[cu12]", "pytest", "cupy-cuda12x", "numba"]