diff --git a/.github/workflows/build-and-test-python-wheels.yml b/.github/workflows/build-and-test-python-wheels.yml
index d51f504e..46f391bc 100644
--- a/.github/workflows/build-and-test-python-wheels.yml
+++ b/.github/workflows/build-and-test-python-wheels.yml
@@ -9,9 +9,8 @@ defaults:
     shell: bash --noprofile --norc -euo pipefail {0}
 
 jobs:
-  # Build wheels for all CUDA/Python combinations
   build-wheels:
-    name: Build wheel (CUDA ${{ matrix.cuda }}, Python ${{ matrix.python }})
+    name: Build wheel (Python ${{ matrix.python }})
     runs-on: ubuntu-latest
     permissions:
       id-token: write
@@ -19,7 +18,6 @@ jobs:
     strategy:
       fail-fast: false
      matrix:
-        cuda: ['12', '13']
         python: ['3.10', '3.11', '3.12', '3.13']
 
     steps:
@@ -32,19 +30,20 @@ jobs:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
 
-      - name: Build wheel
+      - name: Build multi-CUDA wheel
         run: |
-          bash ci/build_pynvbench_wheel.sh -py-version ${{ matrix.python }} -cuda-version ${{ matrix.cuda }}
+          bash ci/build_multi_cuda_wheel.sh -py-version ${{ matrix.python }}
 
       - name: Upload wheel artifact
         uses: actions/upload-artifact@v4
         with:
-          name: wheel-pynvbench-cu${{ matrix.cuda }}-py${{ matrix.python }}
+          name: wheel-pynvbench-py${{ matrix.python }}
           path: wheelhouse/*.whl
           retention-days: 7
           if-no-files-found: error
 
-  # Test wheels for all CUDA/Python combinations
+  # Test wheels on all CUDA/Python combinations
+  # Each wheel contains extensions for both CUDA 12 and 13; runtime detection picks the right one
   test-wheels:
     name: Test wheel (CUDA ${{ matrix.cuda }}, Python ${{ matrix.python }})
     needs: build-wheels
@@ -68,7 +67,7 @@
       - name: Download wheel artifact
         uses: actions/download-artifact@v4
         with:
-          name: wheel-pynvbench-cu${{ matrix.cuda }}-py${{ matrix.python }}
+          name: wheel-pynvbench-py${{ matrix.python }}
           path: wheelhouse
 
       - name: Test wheel
@@ -76,8 +75,10 @@
           # Use the same rapidsai/ci-wheel Docker image as build
           if [[ "${{ matrix.cuda }}" == "12" ]]; then
             cuda_full_version="12.9.1"
+            cuda_extra="cu12"
           else
             cuda_full_version="13.0.1"
+            cuda_extra="cu13"
           fi
 
           docker run --rm \
@@ -86,6 +87,7 @@
             --mount type=bind,source=$(pwd),target=/workspace/ \
             --env py_version=${{ matrix.python }} \
             --env cuda_version=${{ matrix.cuda }} \
+            --env cuda_extra="${cuda_extra}" \
             rapidsai/ci-wheel:25.12-cuda${cuda_full_version}-rockylinux8-py${{ matrix.python }} \
             /workspace/ci/test_pynvbench_inner.sh
diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml
deleted file mode 100644
index 37dfb3d7..00000000
--- a/.github/workflows/build-wheels.yml
+++ /dev/null
@@ -1,58 +0,0 @@
-name: Build Python Wheels (Manual)
-
-on:
-  workflow_dispatch:
-  workflow_call:
-
-defaults:
-  run:
-    shell: bash --noprofile --norc -euo pipefail {0}
-
-jobs:
-  build-wheels:
-    name: Build wheel (CUDA ${{ matrix.cuda }}, Python ${{ matrix.python }})
-    runs-on: ubuntu-latest
-    permissions:
-      id-token: write
-      contents: read
-    strategy:
-      fail-fast: false
-      matrix:
-        cuda: ['12', '13']
-        python: ['3.10', '3.11', '3.12', '3.13']
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          persist-credentials: false
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Build wheel
-        run: |
-          bash ci/build_pynvbench_wheel.sh -py-version ${{ matrix.python }} -cuda-version ${{ matrix.cuda }}
-
-      - name: Upload wheel artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: wheel-pynvbench-cu${{ matrix.cuda }}-py${{ matrix.python }}
-          path: wheelhouse/*.whl
-          retention-days: 7
-          if-no-files-found: error
-
-  verify-wheels:
-    name: Verify all wheels built successfully
-    if: ${{ always() }}
-    needs: build-wheels
-    runs-on: ubuntu-latest
-    steps:
-      - name: Check build results
-        run: |
-          if [[ "${{ needs.build-wheels.result }}" != "success" ]]; then
-            echo "Wheel builds failed!"
-            exit 1
-          fi
-          echo "All wheels built successfully!"
diff --git a/.gitignore b/.gitignore
index 2f64907d..eec6d226 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,8 +11,13 @@ CMakeUserPresets.json
 
 # Python wheel builds
 wheelhouse/
+wheelhouse_merged/
+wheelhouse_final/
+dist/
+dist_*/
 *.whl
 *.egg-info/
 __pycache__/
 *.pyc
 *.pyo
+.pytest_cache/
diff --git a/ci/build_multi_cuda_wheel.sh b/ci/build_multi_cuda_wheel.sh
new file mode 100755
index 00000000..058dd311
--- /dev/null
+++ b/ci/build_multi_cuda_wheel.sh
@@ -0,0 +1,192 @@
+#!/bin/bash
+# Apache 2.0 License
+# Copyright 2024-2025 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 with the LLVM exception
+# (the "License"); you may not use this file except in compliance with
+# the License.
+#
+# You may obtain a copy of the License at
+#
+#     http://llvm.org/foundation/relicensing/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Build a multi-CUDA wheel for the given Python version
+# This builds separate wheels for each supported CUDA major version,
+# and then merges them into a single wheel containing extensions
+# for all CUDA versions. At runtime, depending on the installed CUDA version,
+# the correct extension will be chosen.
+
+set -euo pipefail
+
+ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+usage="Usage: $0 -py-version <python-version> [additional options...]"
+
+source "$ci_dir/util/python/common_arg_parser.sh"
+parse_python_args "$@"
+
+# Check if py_version was provided (this script requires it)
+require_py_version "$usage" || exit 1
+
+echo "Docker socket: " $(ls /var/run/docker.sock)
+
+# Set HOST_WORKSPACE if not already set (for local runs)
+if [[ -z "${HOST_WORKSPACE:-}" ]]; then
+  # Get the repository root
+  HOST_WORKSPACE="$(cd "${ci_dir}/.." && pwd)"
+  echo "Setting HOST_WORKSPACE to: $HOST_WORKSPACE"
+fi
+
+# pynvbench must be built in a container that can produce manylinux wheels,
+# and has the CUDA toolkit installed. We use the rapidsai/ci-wheel image for this.
+# We build separate wheels using separate containers for each CUDA version,
+# then merge them into a single wheel.
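+#
+# Illustrative layout of the final merged wheel (names assumed here; the
+# actual paths come from the install rules in python/CMakeLists.txt):
+#   cuda/bench/__init__.py          - dispatcher, picks cu12/ or cu13/ at import time
+#   cuda/bench/cu12/_nvbench.*.so   - extension built against CUDA 12
+#   cuda/bench/cu13/_nvbench.*.so   - extension built against CUDA 13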
+ +readonly cuda12_version=12.9.1 +readonly cuda13_version=13.0.1 +readonly devcontainer_version=25.12 +readonly devcontainer_distro=rockylinux8 + +if [[ "$(uname -m)" == "aarch64" ]]; then + readonly host_arch_suffix="-arm64" +else + readonly host_arch_suffix="" +fi + +readonly cuda12_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda12_version}-${devcontainer_distro}-py${py_version}${host_arch_suffix} +readonly cuda13_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda13_version}-${devcontainer_distro}-py${py_version}${host_arch_suffix} + +mkdir -p wheelhouse + +for ctk in 12 13; do + image=$(eval echo \$cuda${ctk}_image) + echo "::group::⚒️ Building CUDA ${ctk} wheel on ${image}" + ( + set -x + docker pull $image + docker run --rm -i \ + --workdir /workspace/python \ + --mount type=bind,source=${HOST_WORKSPACE},target=/workspace/ \ + --env py_version=${py_version} \ + $image \ + /workspace/ci/build_pynvbench_wheel_for_cuda.sh + # Prevent GHA runners from exhausting available storage with leftover images: + if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + docker rmi -f $image + fi + ) + echo "::endgroup::" +done + +echo "Merging CUDA wheels..." + +# Detect python command +if command -v python &> /dev/null; then + PYTHON=python +elif command -v python3 &> /dev/null; then + PYTHON=python3 +else + echo "Error: No python found" + exit 1 +fi + +# Needed for unpacking and repacking wheels. +$PYTHON -m pip install --break-system-packages wheel + +# Find the built wheels (temporarily suffixed with .cu12/.cu13 to avoid collision) +cu12_wheel=$(find wheelhouse -name "*cu12*.whl" | head -1) +cu13_wheel=$(find wheelhouse -name "*cu13*.whl" | head -1) + +if [[ -z "$cu12_wheel" ]]; then + echo "Error: CUDA 12 wheel not found in wheelhouse/" + ls -la wheelhouse/ + exit 1 +fi + +if [[ -z "$cu13_wheel" ]]; then + echo "Error: CUDA 13 wheel not found in wheelhouse/" + ls -la wheelhouse/ + exit 1 +fi + +if [[ "$cu12_wheel" == "$cu13_wheel" ]]; then + echo "Error: Only one wheel found, expected two (CUDA 12 and CUDA 13)" + ls -la wheelhouse/ + exit 1 +fi + +echo "Found CUDA 12 wheel: $cu12_wheel" +echo "Found CUDA 13 wheel: $cu13_wheel" + +# Convert to absolute paths before changing directory +cu12_wheel=$(readlink -f "$cu12_wheel") +cu13_wheel=$(readlink -f "$cu13_wheel") + +# Merge the wheels manually +mkdir -p wheelhouse_merged +cd wheelhouse_merged + +# Unpack CUDA 12 wheel (this will be our base) +$PYTHON -m wheel unpack "$cu12_wheel" +base_dir=$(find . -maxdepth 1 -type d -name "pynvbench-*" | head -1) + +# Unpack CUDA 13 wheel into a temporary subdirectory +mkdir cu13_tmp +cd cu13_tmp +$PYTHON -m wheel unpack "$cu13_wheel" +cu13_dir=$(find . -maxdepth 1 -type d -name "pynvbench-*" | head -1) + +# Copy the cu13/ directory from CUDA 13 wheel into the base wheel +cp -r "$cu13_dir"/cuda/bench/cu13 "../$base_dir/cuda/bench/" + +# Go back and clean up +cd .. +rm -rf cu13_tmp + +# Remove RECORD file to let wheel recreate it +rm -f "$base_dir"/*.dist-info/RECORD + +# Repack the merged wheel +$PYTHON -m wheel pack "$base_dir" + +cd .. 
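+
+# Optional sanity check (assumes both extensions landed in the merged wheel):
+#   $PYTHON -m zipfile -l wheelhouse_merged/pynvbench-*.whl | grep -E 'cu1[23]/_nvbench'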
+
+# Install auditwheel and repair the merged wheel
+$PYTHON -m pip install --break-system-packages auditwheel
+for wheel in wheelhouse_merged/pynvbench-*.whl; do
+    echo "Repairing merged wheel: $wheel"
+    $PYTHON -m auditwheel repair \
+        --exclude 'libcuda.so.1' \
+        --exclude 'libnvidia-ml.so.1' \
+        --exclude 'libcupti.so.12' \
+        --exclude 'libcupti.so.13' \
+        --exclude 'libnvperf_host.so' \
+        --exclude 'libnvperf_target.so' \
+        "$wheel" \
+        --wheel-dir wheelhouse_final
+done
+
+# Clean up intermediate files and move only the final merged wheel to wheelhouse
+rm -rf wheelhouse/*  # Clean existing wheelhouse
+mkdir -p wheelhouse
+
+# Move only the final repaired merged wheel
+if ls wheelhouse_final/pynvbench-*.whl 1> /dev/null 2>&1; then
+    mv wheelhouse_final/pynvbench-*.whl wheelhouse/
+    echo "Final merged wheel moved to wheelhouse"
+else
+    echo "No final repaired wheel found, moving unrepaired merged wheel"
+    mv wheelhouse_merged/pynvbench-*.whl wheelhouse/
+fi
+
+# Clean up temporary directories
+rm -rf wheelhouse_merged wheelhouse_final
+
+echo "Final wheels in wheelhouse:"
+ls -la wheelhouse/
diff --git a/ci/build_pynvbench_wheel.sh b/ci/build_pynvbench_wheel.sh
deleted file mode 100755
index 306891be..00000000
--- a/ci/build_pynvbench_wheel.sh
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
-usage="Usage: $0 -py-version <python-version> -cuda-version <cuda-version>"
-
-source "$ci_dir/util/python/common_arg_parser.sh"
-
-# Parse arguments including CUDA version
-parse_python_args "$@"
-
-# Now parse CUDA version from remaining arguments
-cuda_version=""
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        -cuda-version=*)
-            cuda_version="${1#*=}"
-            shift
-            ;;
-        -cuda-version)
-            if [[ $# -lt 2 ]]; then
-                echo "Error: -cuda-version requires a value" >&2
-                exit 1
-            fi
-            cuda_version="$2"
-            shift 2
-            ;;
-        *)
-            shift
-            ;;
-    esac
-done
-
-# Check if py_version was provided (this script requires it)
-require_py_version "$usage" || exit 1
-
-if [[ -z "$cuda_version" ]]; then
-    echo "Error: -cuda-version is required"
-    echo "$usage"
-    exit 1
-fi
-
-echo "Docker socket: " $(ls /var/run/docker.sock)
-
-# Map cuda_version to full version
-if [[ "$cuda_version" == "12" ]]; then
-    cuda_full_version="12.9.1"
-    cuda_short="cu12"
-elif [[ "$cuda_version" == "13" ]]; then
-    cuda_full_version="13.0.1"
-    cuda_short="cu13"
-else
-    echo "Error: Unsupported CUDA version: $cuda_version"
-    exit 1
-fi
-
-# pynvbench must be built in a container that can produce manylinux wheels,
-# and has the CUDA toolkit installed. We use the rapidsai/ci-wheel image for this.
-readonly devcontainer_version=25.12
-readonly devcontainer_distro=rockylinux8
-
-if [[ "$(uname -m)" == "aarch64" ]]; then
-    readonly cuda_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda_full_version}-${devcontainer_distro}-py${py_version}-arm64
-else
-    readonly cuda_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda_full_version}-${devcontainer_distro}-py${py_version}
-fi
-
-mkdir -p wheelhouse
-
-echo "::group::⚒️ Building CUDA ${cuda_version} wheel on ${cuda_image}"
-(
-    set -x
-    docker pull $cuda_image
-    docker run --rm -i \
-        --workdir /workspace/python \
-        --mount type=bind,source=$(pwd),target=/workspace/ \
-        --env py_version=${py_version} \
-        --env cuda_version=${cuda_version} \
-        $cuda_image \
-        /workspace/ci/build_pynvbench_wheel_inner.sh
-    # Prevent GHA runners from exhausting available storage with leftover images:
-    if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
-        docker rmi -f $cuda_image
-    fi
-)
-echo "::endgroup::"
-
-echo "Wheels in wheelhouse:"
-ls -la wheelhouse/
diff --git a/ci/build_pynvbench_wheel_for_cuda.sh b/ci/build_pynvbench_wheel_for_cuda.sh
new file mode 100755
index 00000000..c0289ca2
--- /dev/null
+++ b/ci/build_pynvbench_wheel_for_cuda.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+# Apache 2.0 License
+# Copyright 2024-2025 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 with the LLVM exception
+# (the "License"); you may not use this file except in compliance with
+# the License.
+#
+# You may obtain a copy of the License at
+#
+#     http://llvm.org/foundation/relicensing/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euxo pipefail
+
+# Target script for the `docker run` command in build_multi_cuda_wheel.sh.
+# This script builds a single wheel for the container's CUDA version.
+# The /workspace pathnames are hard-wired here.
+
+# Determine the CUDA major version from nvcc (used later to tag the built wheel)
+cuda_version=$(nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+' | cut -d. -f1)
+echo "Detected CUDA version: ${cuda_version}"
+
+# Install GCC 13 toolset (needed for the build)
+/workspace/ci/util/retry.sh 5 30 dnf -y install gcc-toolset-13-gcc gcc-toolset-13-gcc-c++
+echo -e "#!/bin/bash\nsource /opt/rh/gcc-toolset-13/enable" >/etc/profile.d/enable_devtools.sh
+source /etc/profile.d/enable_devtools.sh
+
+# Note: CUDA dev packages (NVML, CUPTI, CUDART) are already installed in rapidsai/ci-wheel containers
+
+# Set up Python environment
+source /workspace/ci/pyenv_helper.sh
+setup_python_env "${py_version}"
+which python
+python --version
+echo "Done setting up python env"
+
+# Ensure we have full git history for setuptools_scm
+if $(git rev-parse --is-shallow-repository); then
+    git fetch --unshallow
+fi
+
+cd /workspace/python
+
+# Configure compilers
+export CXX="$(which g++)"
+export CUDACXX="$(which nvcc)"
+export CUDAHOSTCXX="$(which g++)"
+
+# Build the wheel
+python -m pip wheel --no-deps --verbose --wheel-dir dist .
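+
+# pip writes a single wheel into dist/; the name below is illustrative only, e.g.
+#   dist/pynvbench-0.1.0.dev1+gabc123-cp312-cp312-linux_x86_64.whl
+# (auditwheel repair to a manylinux tag happens later, in the merge script)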
+ +# Temporarily rename wheel to include CUDA version to avoid collision during multi-CUDA build +# The merge script will combine these into a single wheel +for wheel in dist/pynvbench-*.whl; do + if [[ -f "$wheel" ]]; then + base_name=$(basename "$wheel" .whl) + new_name="${base_name}.cu${cuda_version}.whl" + mv "$wheel" "dist/${new_name}" + echo "Renamed wheel to: ${new_name}" + fi +done + +# Move wheel to output directory +mkdir -p /workspace/wheelhouse +mv dist/pynvbench-*.cu*.whl /workspace/wheelhouse/ diff --git a/ci/build_pynvbench_wheel_inner.sh b/ci/build_pynvbench_wheel_inner.sh deleted file mode 100755 index 543deb6a..00000000 --- a/ci/build_pynvbench_wheel_inner.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# Target script for `docker run` command in build_pynvbench_wheel.sh -# The /workspace pathnames are hard-wired here. - -# Install GCC 13 toolset (needed for the build) -/workspace/ci/util/retry.sh 5 30 dnf -y install gcc-toolset-13-gcc gcc-toolset-13-gcc-c++ -echo -e "#!/bin/bash\nsource /opt/rh/gcc-toolset-13/enable" >/etc/profile.d/enable_devtools.sh -source /etc/profile.d/enable_devtools.sh - -# Check what's available -which gcc -gcc --version -which nvcc -nvcc --version - -# Set up Python environment -source /workspace/ci/pyenv_helper.sh -setup_python_env "${py_version}" -which python -python --version -echo "Done setting up python env" - -# Ensure we have full git history for setuptools_scm -if $(git rev-parse --is-shallow-repository); then - git fetch --unshallow -fi - -cd /workspace/python - -# Determine CUDA version from nvcc -cuda_major=$(nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+' | cut -d. -f1) -echo "Detected CUDA major version: ${cuda_major}" - -# Configure compilers: -export CXX="$(which g++)" -export CUDACXX="$(which nvcc)" -export CUDAHOSTCXX="$(which g++)" - -# Build the wheel -python -m pip wheel --no-deps --verbose --wheel-dir dist . 
- -# Install auditwheel for manylinux compliance -python -m pip install auditwheel - -# Repair wheel to make it manylinux compliant -mkdir -p dist_repaired -for wheel in dist/pynvbench-*.whl; do - if [[ -f "$wheel" ]]; then - echo "Repairing wheel: $wheel" - python -m auditwheel repair \ - --exclude 'libcuda.so.1' \ - --exclude 'libnvidia-ml.so.1' \ - "$wheel" \ - --wheel-dir dist_repaired - fi -done - -# Rename wheel to include CUDA version suffix -mkdir -p /workspace/wheelhouse -for wheel in dist_repaired/pynvbench-*.whl; do - if [[ -f "$wheel" ]]; then - base_name=$(basename "$wheel" .whl) - # Append CUDA version to the local version identifier - # e.g., pynvbench-0.1.0.dev1+gabc123-cp312-cp312-manylinux_2_28_x86_64.whl - # becomes pynvbench-0.1.0.dev1+gabc123.cu12-cp312-cp312-manylinux_2_28_x86_64.whl - if [[ "$base_name" =~ ^(.*)-cp([0-9]+)-cp([0-9]+)-(.*) ]]; then - pkg_version="${BASH_REMATCH[1]}" - py_tag="cp${BASH_REMATCH[2]}" - abi_tag="cp${BASH_REMATCH[3]}" - platform="${BASH_REMATCH[4]}" - # If version has a local part (contains +), append .cu${cuda_major} to it - # Otherwise add +cu${cuda_major} - if [[ "$pkg_version" =~ \+ ]]; then - new_version="${pkg_version}.cu${cuda_major}" - else - new_version="${pkg_version}+cu${cuda_major}" - fi - new_name="${new_version}-${py_tag}-${abi_tag}-${platform}.whl" - mv "$wheel" "/workspace/wheelhouse/${new_name}" - echo "Renamed wheel to: ${new_name}" - else - # Fallback if regex doesn't match - mv "$wheel" /workspace/wheelhouse/ - echo "Moved wheel: $(basename $wheel)" - fi - fi -done diff --git a/ci/test_pynvbench.sh b/ci/test_pynvbench.sh index b021f0b5..53451515 100755 --- a/ci/test_pynvbench.sh +++ b/ci/test_pynvbench.sh @@ -40,11 +40,13 @@ if [[ -z "$cuda_version" ]]; then exit 1 fi -# Map cuda_version to full version +# Map cuda_version to full version and set CUDA extra if [[ "$cuda_version" == "12" ]]; then cuda_full_version="12.9.1" + cuda_extra="cu12" elif [[ "$cuda_version" == "13" ]]; then cuda_full_version="13.0.1" + cuda_extra="cu13" else echo "Error: Unsupported CUDA version: $cuda_version" exit 1 @@ -66,9 +68,11 @@ echo "::group::🧪 Testing CUDA ${cuda_version} wheel on ${cuda_image}" docker pull $cuda_image docker run --rm -i \ --workdir /workspace \ + --gpus all \ --mount type=bind,source=$(pwd),target=/workspace/ \ --env py_version=${py_version} \ --env cuda_version=${cuda_version} \ + --env cuda_extra="${cuda_extra}" \ $cuda_image \ /workspace/ci/test_pynvbench_inner.sh # Prevent GHA runners from exhausting available storage with leftover images: diff --git a/ci/test_pynvbench_inner.sh b/ci/test_pynvbench_inner.sh index 80748fa9..4303a5dd 100755 --- a/ci/test_pynvbench_inner.sh +++ b/ci/test_pynvbench_inner.sh @@ -24,19 +24,26 @@ echo "CUDA version: $(nvcc --version | grep release)" # Wheel should be in /workspace/wheelhouse (downloaded by workflow or built locally) WHEELHOUSE_DIR="/workspace/wheelhouse" -# Find and install pynvbench wheel -# Look for .cu${cuda_version} in the version string (e.g., pynvbench-0.0.1.dev1+g123.cu12-...) 
-PYNVBENCH_WHEEL_PATH="$(ls ${WHEELHOUSE_DIR}/pynvbench-*.cu${cuda_version}-*.whl 2>/dev/null | head -1)"
+# Find the pynvbench wheel (multi-CUDA wheel)
+# Prefer manylinux wheels, fall back to any wheel
+PYNVBENCH_WHEEL_PATH="$(ls ${WHEELHOUSE_DIR}/pynvbench-*manylinux*.whl 2>/dev/null | head -1)"
+if [[ -z "$PYNVBENCH_WHEEL_PATH" ]]; then
+    PYNVBENCH_WHEEL_PATH="$(ls ${WHEELHOUSE_DIR}/pynvbench-*.whl 2>/dev/null | head -1)"
+fi
+
 if [[ -z "$PYNVBENCH_WHEEL_PATH" ]]; then
     echo "Error: No pynvbench wheel found in ${WHEELHOUSE_DIR}"
-    echo "Looking for: pynvbench-*.cu${cuda_version}-*.whl"
     echo "Contents of ${WHEELHOUSE_DIR}:"
     ls -la ${WHEELHOUSE_DIR}/ || true
     exit 1
 fi
 
-echo "Installing wheel: $PYNVBENCH_WHEEL_PATH"
-python -m pip install "${PYNVBENCH_WHEEL_PATH}[test]"
+# Determine which extras to install; cuda_extra falls back to cu${cuda_version} when unset
+CUDA_EXTRA="${cuda_extra:-cu${cuda_version}}"
+TEST_EXTRA="test-cu${cuda_version}"
+
+echo "Installing wheel: $PYNVBENCH_WHEEL_PATH with extras: ${TEST_EXTRA}"
+python -m pip install "${PYNVBENCH_WHEEL_PATH}[${TEST_EXTRA}]"
 
 # Run tests
 cd "/workspace/python/test/"
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index c89c085b..7f8548c4 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -25,19 +25,23 @@ CPMAddPackage(
 
 CPMAddPackage("gh:pybind/pybind11@3.0.1")
 
+# Determine CUDA major version for directory structure
+set(CUDA_VERSION_DIR "cu${CUDAToolkit_VERSION_MAJOR}")
+message(STATUS "Building extension for CUDA ${CUDAToolkit_VERSION_MAJOR}, output directory: cuda/bench/${CUDA_VERSION_DIR}")
+
 add_library(_nvbench MODULE src/py_nvbench.cpp)
 target_include_directories(_nvbench PRIVATE ${Python_INCLUDE_DIRS})
 target_link_libraries(_nvbench PUBLIC nvbench::nvbench)
-target_link_libraries(_nvbench PRIVATE CUDA::cudart_static pybind11::headers)
+target_link_libraries(_nvbench PRIVATE CUDA::cudart_static CUDA::cuda_driver pybind11::headers)
 
 set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN")
 set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON)
 set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON)
 set_target_properties(_nvbench PROPERTIES PREFIX "" SUFFIX "${PYTHON_MODULE_EXTENSION}")
 
-install(TARGETS _nvbench DESTINATION cuda/bench)
+install(TARGETS _nvbench DESTINATION cuda/bench/${CUDA_VERSION_DIR})
 
 # Determine target that nvbench::nvbench is an alias of,
 # necessary because ALIAS targets cannot be installed
 get_target_property(_aliased_target_name nvbench::nvbench ALIASED_TARGET)
-install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/bench)
+install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/bench/${CUDA_VERSION_DIR})
diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py
index e1d2282a..a47bb394 100644
--- a/python/cuda/bench/__init__.py
+++ b/python/cuda/bench/__init__.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import importlib
 import importlib.metadata
 import warnings
 
@@ -31,29 +32,63 @@
         f"Version is set to fall-back value '{__version__}' instead."
     )
+
+# Detect the CUDA major version and load the appropriate extension
+def _get_cuda_major_version():
+    """Detect the CUDA major version from the installed cuda-bindings package."""
+    try:
+        import cuda.bindings
+
+        # Get CUDA version from cuda-bindings package version
+        # cuda-bindings version is in format like "12.9.1" or "13.0.0"
+        version_str = cuda.bindings.__version__
+        major = int(version_str.split(".")[0])
+        return major
+    except ImportError:
+        raise ImportError(
+            "cuda-bindings is required for runtime CUDA version detection. "
+            "Install with: pip install pynvbench[cu12] or pip install pynvbench[cu13]"
+        )
+
+
+_cuda_major = _get_cuda_major_version()
+_extra_name = f"cu{_cuda_major}"
+_module_fullname = f"cuda.bench.{_extra_name}._nvbench"
+
+try:
+    _nvbench_module = importlib.import_module(_module_fullname)
+except ImportError as e:
+    raise ImportError(
+        f"No pynvbench extension found for CUDA {_cuda_major}.x. "
+        f"This wheel may not include support for your CUDA version. "
+        f"Supported CUDA versions: 12, 13. "
+        f"Original error: {e}"
+    )
+
+# Load required NVIDIA libraries
 for libname in ("cupti", "nvperf_target", "nvperf_host"):
     load_nvidia_dynamic_lib(libname)
 
-from cuda.bench._nvbench import (  # noqa: E402
-    Benchmark as Benchmark,
-)
-from cuda.bench._nvbench import (  # noqa: E402
-    CudaStream as CudaStream,
-)
-from cuda.bench._nvbench import (  # noqa: E402
-    Launch as Launch,
-)
-from cuda.bench._nvbench import (  # noqa: E402
-    NVBenchRuntimeError as NVBenchRuntimeError,
-)
-from cuda.bench._nvbench import (  # noqa: E402
-    State as State,
-)
-from cuda.bench._nvbench import (  # noqa: E402
-    register as register,
-)
-from cuda.bench._nvbench import (  # noqa: E402
-    run_all_benchmarks as run_all_benchmarks,
-)
+# Import and expose all public symbols from the CUDA-specific extension
+Benchmark = _nvbench_module.Benchmark
+CudaStream = _nvbench_module.CudaStream
+Launch = _nvbench_module.Launch
+NVBenchRuntimeError = _nvbench_module.NVBenchRuntimeError
+State = _nvbench_module.State
+register = _nvbench_module.register
+run_all_benchmarks = _nvbench_module.run_all_benchmarks
+test_cpp_exception = _nvbench_module.test_cpp_exception
+test_py_exception = _nvbench_module.test_py_exception
+
+# Expose the module as _nvbench for backward compatibility (e.g., for tests)
+_nvbench = _nvbench_module
 
-del load_nvidia_dynamic_lib
+# Clean up internal symbols
+del (
+    load_nvidia_dynamic_lib,
+    _nvbench_module,
+    _cuda_major,
+    _extra_name,
+    _module_fullname,
+    _get_cuda_major_version,
+)
diff --git a/python/pyproject.toml b/python/pyproject.toml
index d5356705..e752eca2 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -22,22 +22,26 @@ classifiers = [
 ]
 requires-python = ">=3.10"
 dependencies = [
-    # pathfinder
+    # pathfinder for finding CUDA libraries
     "cuda-pathfinder",
-
-    # Library expects to find shared libraries
-    # libcupti, libnvperf_target, libnvperf_host
-    # pathfinder is used to find it in the Python layout
-    "nvidia-cuda-cupti-cu12",
-
-    # The shared library
-    # libnvidia-ml must be installed system-wide
-    # (Debian package provider: libnvidia-compute)
 ]
 dynamic = ["version"]
 readme = { file = "README.md", content-type = "text/markdown" }
 
 [project.optional-dependencies]
+# CUDA 12.x dependencies
+cu12 = ["cuda-bindings>=12.0.0,<13.0.0", "nvidia-cuda-cupti-cu12"]
+
+# CUDA 13.x dependencies
+cu13 = ["cuda-bindings>=13.0.0,<14.0.0", "nvidia-cuda-cupti>=13.0"]
+
+# Test dependencies for CUDA 12
+test-cu12 = ["pynvbench[cu12]", "pytest", "cupy-cuda12x", "numba"]
+
+# Test dependencies for CUDA 
13 +test-cu13 = ["pynvbench[cu13]", "pytest", "cupy-cuda13x", "numba"] + +# Generic test dependencies (defaults to CUDA 12) test = ["pytest", "cupy-cuda12x", "numba"] [project.urls] diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 2b09574d..68a2f26f 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -272,7 +272,11 @@ std::unique_ptr global_registry{}; // If you modify these bindings, please be sure to update the // corresponding type hints in ``../cuda/nvbench/__init__.pyi`` -PYBIND11_MODULE(_nvbench, m) +#ifndef PYBIND11_MODULE_NAME +#define PYBIND11_MODULE_NAME _nvbench +#endif + +PYBIND11_MODULE(PYBIND11_MODULE_NAME, m) { // == STEP 1 // Set environment variable CUDA_MODULE_LOADING=EAGER
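
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the diff): with the merged wheel,
# the extension choice is driven solely by the installed cuda-bindings major
# version, e.g.:
#
#   pip install "pynvbench[cu13]"   # pulls cuda-bindings>=13.0.0,<14.0.0 and CUPTI
#   python -c "import cuda.bench"   # __init__.py loads cuda.bench.cu13._nvbench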