From 4b5fb32dea460fbaad7eae6f21ad2d6f5e2b0848 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 07:58:24 -0500 Subject: [PATCH 01/20] Add multi-cuda wheel build --- .../build-and-test-python-wheels.yml | 19 +- .github/workflows/build-wheels.yml | 9 +- .gitignore | 17 ++ ci/build_multi_cuda_wheel.sh | 199 ++++++++++++++++++ ci/build_pynvbench_wheel_for_cuda.sh | 77 +++++++ ci/test_pynvbench.sh | 6 +- ci/test_pynvbench_inner.sh | 15 ++ python/CMakeLists.txt | 27 ++- python/cuda/bench/__init__.py | 73 +++++-- python/pyproject.toml | 20 +- 10 files changed, 408 insertions(+), 54 deletions(-) create mode 100755 ci/build_multi_cuda_wheel.sh create mode 100755 ci/build_pynvbench_wheel_for_cuda.sh diff --git a/.github/workflows/build-and-test-python-wheels.yml b/.github/workflows/build-and-test-python-wheels.yml index d51f504e..377e0d6a 100644 --- a/.github/workflows/build-and-test-python-wheels.yml +++ b/.github/workflows/build-and-test-python-wheels.yml @@ -9,9 +9,9 @@ defaults: shell: bash --noprofile --norc -euo pipefail {0} jobs: - # Build wheels for all CUDA/Python combinations + # Build multi-CUDA wheels (one wheel per Python version, includes both CUDA 12 & 13) build-wheels: - name: Build wheel (CUDA ${{ matrix.cuda }}, Python ${{ matrix.python }}) + name: Build wheel (Python ${{ matrix.python }}) runs-on: ubuntu-latest permissions: id-token: write @@ -19,7 +19,6 @@ jobs: strategy: fail-fast: false matrix: - cuda: ['12', '13'] python: ['3.10', '3.11', '3.12', '3.13'] steps: @@ -32,19 +31,20 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - - name: Build wheel + - name: Build multi-CUDA wheel run: | - bash ci/build_pynvbench_wheel.sh -py-version ${{ matrix.python }} -cuda-version ${{ matrix.cuda }} + bash ci/build_multi_cuda_wheel.sh -py-version ${{ matrix.python }} - name: Upload wheel artifact uses: actions/upload-artifact@v4 with: - name: wheel-pynvbench-cu${{ matrix.cuda }}-py${{ matrix.python }} + name: wheel-pynvbench-py${{ matrix.python }} path: wheelhouse/*.whl retention-days: 7 if-no-files-found: error - # Test wheels for all CUDA/Python combinations + # Test wheels on all CUDA/Python combinations + # Each wheel contains extensions for both CUDA 12 & 13, runtime detection picks the right one test-wheels: name: Test wheel (CUDA ${{ matrix.cuda }}, Python ${{ matrix.python }}) needs: build-wheels @@ -68,7 +68,7 @@ jobs: - name: Download wheel artifact uses: actions/download-artifact@v4 with: - name: wheel-pynvbench-cu${{ matrix.cuda }}-py${{ matrix.python }} + name: wheel-pynvbench-py${{ matrix.python }} path: wheelhouse - name: Test wheel @@ -76,8 +76,10 @@ jobs: # Use the same rapidsai/ci-wheel Docker image as build if [[ "${{ matrix.cuda }}" == "12" ]]; then cuda_full_version="12.9.1" + cuda_extra="[cu12]" else cuda_full_version="13.0.1" + cuda_extra="[cu13]" fi docker run --rm \ @@ -86,6 +88,7 @@ jobs: --mount type=bind,source=$(pwd),target=/workspace/ \ --env py_version=${{ matrix.python }} \ --env cuda_version=${{ matrix.cuda }} \ + --env cuda_extra="${cuda_extra}" \ rapidsai/ci-wheel:25.12-cuda${cuda_full_version}-rockylinux8-py${{ matrix.python }} \ /workspace/ci/test_pynvbench_inner.sh diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 37dfb3d7..4ae97491 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -10,7 +10,7 @@ defaults: jobs: build-wheels: - name: Build wheel (CUDA ${{ matrix.cuda }}, Python ${{ matrix.python }}) + name: Build wheel (Python 
${{ matrix.python }}) runs-on: ubuntu-latest permissions: id-token: write @@ -18,7 +18,6 @@ jobs: strategy: fail-fast: false matrix: - cuda: ['12', '13'] python: ['3.10', '3.11', '3.12', '3.13'] steps: @@ -31,14 +30,14 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - - name: Build wheel + - name: Build multi-CUDA wheel run: | - bash ci/build_pynvbench_wheel.sh -py-version ${{ matrix.python }} -cuda-version ${{ matrix.cuda }} + bash ci/build_multi_cuda_wheel.sh -py-version ${{ matrix.python }} - name: Upload wheel artifact uses: actions/upload-artifact@v4 with: - name: wheel-pynvbench-cu${{ matrix.cuda }}-py${{ matrix.python }} + name: wheel-pynvbench-py${{ matrix.python }} path: wheelhouse/*.whl retention-days: 7 if-no-files-found: error diff --git a/.gitignore b/.gitignore index 2f64907d..a5c769c6 100644 --- a/.gitignore +++ b/.gitignore @@ -8,11 +8,28 @@ cmake-build-* *~ compile_commands.json CMakeUserPresets.json +<<<<<<< HEAD # Python wheel builds wheelhouse/ +======= +<<<<<<< Updated upstream +======= + +# Python wheel builds +wheelhouse/ +wheelhouse_merged/ +wheelhouse_final/ +dist/ +dist_*/ +>>>>>>> 1ab0a2e (Add multi-cuda wheel build) *.whl *.egg-info/ __pycache__/ *.pyc *.pyo +<<<<<<< HEAD +======= +.pytest_cache/ +>>>>>>> Stashed changes +>>>>>>> 1ab0a2e (Add multi-cuda wheel build) diff --git a/ci/build_multi_cuda_wheel.sh b/ci/build_multi_cuda_wheel.sh new file mode 100755 index 00000000..af0ca61e --- /dev/null +++ b/ci/build_multi_cuda_wheel.sh @@ -0,0 +1,199 @@ +#!/bin/bash +# Apache 2.0 License +# Copyright 2024-2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +usage="Usage: $0 -py-version [additional options...]" + +source "$ci_dir/util/python/common_arg_parser.sh" +parse_python_args "$@" + +# Check if py_version was provided (this script requires it) +require_py_version "$usage" || exit 1 + +echo "Docker socket: " $(ls /var/run/docker.sock) + +# Set HOST_WORKSPACE if not already set (for local runs) +if [[ -z "${HOST_WORKSPACE:-}" ]]; then + # Get the repository root + HOST_WORKSPACE="$(cd "${ci_dir}/.." && pwd)" + echo "Setting HOST_WORKSPACE to: $HOST_WORKSPACE" +fi + +if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + # Prepare mount points etc for getting artifacts in/out of the container. + source "$ci_dir/util/artifacts/common.sh" + # Note that these mounts use the runner (not the devcontainer) filesystem for + # source directories because of docker-out-of-docker quirks. + # The workflow-job GH actions make sure that they exist before running any + # scripts. + action_mounts=$(cat < /dev/null; then + PYTHON=python +elif command -v python3 &> /dev/null; then + PYTHON=python3 +else + echo "Error: No python found" + exit 1 +fi + +# Needed for unpacking and repacking wheels. 
+$PYTHON -m pip install --break-system-packages wheel + +# Find the built wheels +cu12_wheel=$(find wheelhouse -name "*cu12*.whl" | head -1) +cu13_wheel=$(find wheelhouse -name "*cu13*.whl" | head -1) + +if [[ -z "$cu12_wheel" ]]; then + echo "Error: CUDA 12 wheel not found in wheelhouse/" + ls -la wheelhouse/ + exit 1 +fi + +if [[ -z "$cu13_wheel" ]]; then + echo "Error: CUDA 13 wheel not found in wheelhouse/" + ls -la wheelhouse/ + exit 1 +fi + +echo "Found CUDA 12 wheel: $cu12_wheel" +echo "Found CUDA 13 wheel: $cu13_wheel" + +# Convert to absolute paths before changing directory +cu12_wheel=$(readlink -f "$cu12_wheel") +cu13_wheel=$(readlink -f "$cu13_wheel") + +# Merge the wheels manually (similar to CCCL) +mkdir -p wheelhouse_merged +cd wheelhouse_merged + +# Unpack CUDA 12 wheel (this will be our base) +$PYTHON -m wheel unpack "$cu12_wheel" +base_dir=$(find . -maxdepth 1 -type d -name "pynvbench-*" | head -1) + +# Unpack CUDA 13 wheel into a temporary subdirectory +mkdir cu13_tmp +cd cu13_tmp +$PYTHON -m wheel unpack "$cu13_wheel" +cu13_dir=$(find . -maxdepth 1 -type d -name "pynvbench-*" | head -1) + +# Copy the CUDA 13 extension into the base wheel +cp "$cu13_dir"/cuda/bench/_nvbench_cu13*.so "../$base_dir/cuda/bench/" + +# Go back and clean up +cd .. +rm -rf cu13_tmp + +# Repack the merged wheel +$PYTHON -m wheel pack "$base_dir" + +cd .. + +# Install auditwheel and repair the merged wheel +$PYTHON -m pip install --break-system-packages patchelf auditwheel +for wheel in wheelhouse_merged/pynvbench-*.whl; do + echo "Repairing merged wheel: $wheel" + $PYTHON -m auditwheel repair \ + --exclude 'libcuda.so.1' \ + --exclude 'libnvidia-ml.so.1' \ + "$wheel" \ + --wheel-dir wheelhouse_final +done + +# Clean up intermediate files and move only the final merged wheel to wheelhouse +rm -rf wheelhouse/* # Clean existing wheelhouse +mkdir -p wheelhouse + +# Move only the final repaired merged wheel +if ls wheelhouse_final/pynvbench-*.whl 1> /dev/null 2>&1; then + mv wheelhouse_final/pynvbench-*.whl wheelhouse/ + echo "Final merged wheel moved to wheelhouse" +else + echo "No final repaired wheel found, moving unrepaired merged wheel" + mv wheelhouse_merged/pynvbench-*.whl wheelhouse/ +fi + +# Clean up temporary directories +rm -rf wheelhouse_merged wheelhouse_final + +echo "Final wheels in wheelhouse:" +ls -la wheelhouse/ + +if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + wheel_artifact_name="$(ci/util/workflow/get_wheel_artifact_name.sh)" + ci/util/artifacts/upload.sh $wheel_artifact_name 'wheelhouse/.*' +fi diff --git a/ci/build_pynvbench_wheel_for_cuda.sh b/ci/build_pynvbench_wheel_for_cuda.sh new file mode 100755 index 00000000..0e3d4b7d --- /dev/null +++ b/ci/build_pynvbench_wheel_for_cuda.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# Apache 2.0 License +# Copyright 2024-2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euxo pipefail + +# Target script for `docker run` command in build_multi_cuda_wheel.sh +# This script builds a single wheel for the container's CUDA version +# The /workspace pathnames are hard-wired here. + +# Install GCC 13 toolset (needed for the build) +/workspace/ci/util/retry.sh 5 30 dnf -y install gcc-toolset-13-gcc gcc-toolset-13-gcc-c++ +echo -e "#!/bin/bash\nsource /opt/rh/gcc-toolset-13/enable" >/etc/profile.d/enable_devtools.sh +source /etc/profile.d/enable_devtools.sh + +# Check what's available +which gcc +gcc --version +which nvcc +nvcc --version + +# Set up Python environment +source /workspace/ci/pyenv_helper.sh +setup_python_env "${py_version}" +which python +python --version +echo "Done setting up python env" + +# Ensure we have full git history for setuptools_scm +if $(git rev-parse --is-shallow-repository); then + git fetch --unshallow +fi + +cd /workspace/python + +# Determine CUDA version from nvcc +cuda_version=$(nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+' | cut -d. -f1) +echo "Detected CUDA version: ${cuda_version}" + +# Configure compilers +export CXX="$(which g++)" +export CUDACXX="$(which nvcc)" +export CUDAHOSTCXX="$(which g++)" + +# Set CUDA suffix for extension naming +export PYNVBENCH_CUDA_SUFFIX="_cu${cuda_version}" + +# Build the wheel +python -m pip wheel --no-deps --verbose --wheel-dir dist . + +# Rename wheel to include CUDA version suffix +for wheel in dist/pynvbench-*.whl; do + if [[ -f "$wheel" ]]; then + base_name=$(basename "$wheel" .whl) + new_name="${base_name}.cu${cuda_version}.whl" + mv "$wheel" "dist/${new_name}" + echo "Renamed wheel to: ${new_name}" + fi +done + +# Move wheel to output directory +mkdir -p /workspace/wheelhouse +mv dist/pynvbench-*.cu*.whl /workspace/wheelhouse/ diff --git a/ci/test_pynvbench.sh b/ci/test_pynvbench.sh index b021f0b5..ec3e4573 100755 --- a/ci/test_pynvbench.sh +++ b/ci/test_pynvbench.sh @@ -40,11 +40,13 @@ if [[ -z "$cuda_version" ]]; then exit 1 fi -# Map cuda_version to full version +# Map cuda_version to full version and set CUDA extra if [[ "$cuda_version" == "12" ]]; then cuda_full_version="12.9.1" + cuda_extra="[cu12]" elif [[ "$cuda_version" == "13" ]]; then cuda_full_version="13.0.1" + cuda_extra="[cu13]" else echo "Error: Unsupported CUDA version: $cuda_version" exit 1 @@ -66,9 +68,11 @@ echo "::group::🧪 Testing CUDA ${cuda_version} wheel on ${cuda_image}" docker pull $cuda_image docker run --rm -i \ --workdir /workspace \ + --gpus all \ --mount type=bind,source=$(pwd),target=/workspace/ \ --env py_version=${py_version} \ --env cuda_version=${cuda_version} \ + --env cuda_extra="${cuda_extra}" \ $cuda_image \ /workspace/ci/test_pynvbench_inner.sh # Prevent GHA runners from exhausting available storage with leftover images: diff --git a/ci/test_pynvbench_inner.sh b/ci/test_pynvbench_inner.sh index 80748fa9..02d8ee0b 100755 --- a/ci/test_pynvbench_inner.sh +++ b/ci/test_pynvbench_inner.sh @@ -24,19 +24,34 @@ echo "CUDA version: $(nvcc --version | grep release)" # Wheel should be in /workspace/wheelhouse (downloaded by workflow or built locally) WHEELHOUSE_DIR="/workspace/wheelhouse" +<<<<<<< HEAD # Find and install pynvbench wheel # Look for .cu${cuda_version} in the version string (e.g., pynvbench-0.0.1.dev1+g123.cu12-...) 
PYNVBENCH_WHEEL_PATH="$(ls ${WHEELHOUSE_DIR}/pynvbench-*.cu${cuda_version}-*.whl 2>/dev/null | head -1)" if [[ -z "$PYNVBENCH_WHEEL_PATH" ]]; then echo "Error: No pynvbench wheel found in ${WHEELHOUSE_DIR}" echo "Looking for: pynvbench-*.cu${cuda_version}-*.whl" +======= +# Find the pynvbench wheel (multi-CUDA wheel, no CUDA version in filename) +PYNVBENCH_WHEEL_PATH="$(ls ${WHEELHOUSE_DIR}/pynvbench-*.whl 2>/dev/null | head -1)" +if [[ -z "$PYNVBENCH_WHEEL_PATH" ]]; then + echo "Error: No pynvbench wheel found in ${WHEELHOUSE_DIR}" +>>>>>>> 1ab0a2e (Add multi-cuda wheel build) echo "Contents of ${WHEELHOUSE_DIR}:" ls -la ${WHEELHOUSE_DIR}/ || true exit 1 fi +<<<<<<< HEAD echo "Installing wheel: $PYNVBENCH_WHEEL_PATH" python -m pip install "${PYNVBENCH_WHEEL_PATH}[test]" +======= +# Determine which CUDA extra to install (defaults to cu12 if not specified) +CUDA_EXTRA="${cuda_extra:-[cu${cuda_version}]}" + +echo "Installing wheel: $PYNVBENCH_WHEEL_PATH with extras: ${CUDA_EXTRA}" +python -m pip install "${PYNVBENCH_WHEEL_PATH}${CUDA_EXTRA},test" +>>>>>>> 1ab0a2e (Add multi-cuda wheel build) # Run tests cd "/workspace/python/test/" diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b18f7ef1..c0dda5a0 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -25,15 +25,28 @@ CPMAddPackage( CPMAddPackage("gh:pybind/pybind11@3.0.0") -pybind11_add_module(_nvbench MODULE src/py_nvbench.cpp) -target_link_libraries(_nvbench PUBLIC nvbench::nvbench) -target_link_libraries(_nvbench PRIVATE CUDA::cudart_static) +# Determine CUDA major version for extension naming +string(REGEX MATCH "^([0-9]+)" CUDA_VERSION_MAJOR "${CUDAToolkit_VERSION}") -set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN") -set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) -set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON) +# Allow override of extension suffix via environment variable (for multi-CUDA builds) +if(DEFINED ENV{PYNVBENCH_CUDA_SUFFIX}) + set(NVBENCH_EXTENSION_SUFFIX $ENV{PYNVBENCH_CUDA_SUFFIX}) +else() + set(NVBENCH_EXTENSION_SUFFIX "_cu${CUDA_VERSION_MAJOR}") +endif() -install(TARGETS _nvbench DESTINATION cuda/bench) +set(NVBENCH_EXTENSION_NAME "_nvbench${NVBENCH_EXTENSION_SUFFIX}") +message(STATUS "Building extension: ${NVBENCH_EXTENSION_NAME}") + +pybind11_add_module(${NVBENCH_EXTENSION_NAME} MODULE src/py_nvbench.cpp) +target_link_libraries(${NVBENCH_EXTENSION_NAME} PUBLIC nvbench::nvbench) +target_link_libraries(${NVBENCH_EXTENSION_NAME} PRIVATE CUDA::cudart_static) + +set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INSTALL_RPATH "$ORIGIN") +set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) +set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) + +install(TARGETS ${NVBENCH_EXTENSION_NAME} DESTINATION cuda/bench) # Determine target that nvbench::nvbench is an alias of, # necessary because ALIAS targets cannot be installed diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py index e1d2282a..d2439fe3 100644 --- a/python/cuda/bench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import importlib import importlib.metadata import warnings @@ -31,29 +32,57 @@ f"Version is set to fall-back value '{__version__}' instead." 
) + +# Detect CUDA runtime version and load appropriate extension +def _get_cuda_major_version(): + """Detect the CUDA runtime major version.""" + try: + from cuda import cuda as cuda_bindings + + err, version = cuda_bindings.cuRuntimeGetVersion() + if err != cuda_bindings.cudaError_t.cudaSuccess: + raise RuntimeError(f"Failed to get CUDA runtime version: {err}") + # Version is encoded as (major * 1000) + (minor * 10) + major = version // 1000 + return major + except ImportError: + raise ImportError( + "cuda-bindings is required for runtime CUDA version detection. " + "Install with: pip install pynvbench[cu12] or pip install pynvbench[cu13]" + ) + + +_cuda_major = _get_cuda_major_version() +_extension_name = f"_nvbench_cu{_cuda_major}" + +try: + _nvbench_module = importlib.import_module(f"cuda.bench.{_extension_name}") +except ImportError as e: + raise ImportError( + f"No pynvbench extension found for CUDA {_cuda_major}.x. " + f"This wheel may not include support for your CUDA version. " + f"Supported CUDA versions: 12, 13. " + f"Original error: {e}" + ) + +# Load required NVIDIA libraries for libname in ("cupti", "nvperf_target", "nvperf_host"): load_nvidia_dynamic_lib(libname) -from cuda.bench._nvbench import ( # noqa: E402 - Benchmark as Benchmark, -) -from cuda.bench._nvbench import ( # noqa: E402 - CudaStream as CudaStream, -) -from cuda.bench._nvbench import ( # noqa: E402 - Launch as Launch, -) -from cuda.bench._nvbench import ( # noqa: E402 - NVBenchRuntimeError as NVBenchRuntimeError, -) -from cuda.bench._nvbench import ( # noqa: E402 - State as State, -) -from cuda.bench._nvbench import ( # noqa: E402 - register as register, -) -from cuda.bench._nvbench import ( # noqa: E402 - run_all_benchmarks as run_all_benchmarks, -) +# Import and expose all public symbols from the CUDA-specific extension +Benchmark = _nvbench_module.Benchmark +CudaStream = _nvbench_module.CudaStream +Launch = _nvbench_module.Launch +NVBenchRuntimeError = _nvbench_module.NVBenchRuntimeError +State = _nvbench_module.State +register = _nvbench_module.register +run_all_benchmarks = _nvbench_module.run_all_benchmarks -del load_nvidia_dynamic_lib +# Clean up internal symbols +del ( + load_nvidia_dynamic_lib, + _nvbench_module, + _cuda_major, + _extension_name, + _get_cuda_major_version, +) diff --git a/python/pyproject.toml b/python/pyproject.toml index d5356705..9d2baebb 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -22,22 +22,20 @@ classifiers = [ ] requires-python = ">=3.10" dependencies = [ - # pathfinder + # pathfinder for finding CUDA libraries "cuda-pathfinder", - - # Library expects to find shared libraries - # libcupti, libnvperf_target, libnvperf_host - # pathfinder is used to find it in the Python layout - "nvidia-cuda-cupti-cu12", - - # The shared library - # libnvidia-ml must be installed system-wide - # (Debian package provider: libnvidia-compute) ] dynamic = ["version"] readme = { file = "README.md", content-type = "text/markdown" } [project.optional-dependencies] +# CUDA 12.x dependencies +cu12 = ["cuda-bindings>=12.0.0,<13.0.0", "nvidia-cuda-cupti-cu12"] + +# CUDA 13.x dependencies +cu13 = ["cuda-bindings>=13.0.0,<14.0.0", "nvidia-cuda-cupti-cu13"] + +# Test dependencies (using CUDA 12 by default) test = ["pytest", "cupy-cuda12x", "numba"] [project.urls] @@ -51,7 +49,7 @@ build-dir = "build/{wheel_tag}" [tool.scikit-build.cmake] version = ">=3.30.4" -args = [] +args = ["-DNVBench_ENABLE_NVML=OFF", "-DNVBench_ENABLE_CUPTI=OFF"] build-type = "Release" source-dir = "." 
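
The loader added to python/cuda/bench/__init__.py above boils down to one pattern: probe which CUDA major version the environment targets, then import the matching compiled extension. Below is a minimal standalone sketch of that pattern, not the exact patched file: the extension names `_nvbench_cu12` / `_nvbench_cu13` follow this patch, and the version probe via `cuda.bindings.__version__` is the variant a later patch in this series switches to (cuda-bindings tracks the CUDA Toolkit major version).

import importlib


def _detect_cuda_major() -> int:
    """Return the CUDA major version the current environment targets."""
    try:
        # cuda-bindings is pulled in via the pynvbench[cu12] / pynvbench[cu13]
        # extras; its version tracks the CUDA Toolkit (e.g. "12.9.1", "13.0.0"),
        # so the first component identifies the CUDA line.
        import cuda.bindings
    except ImportError as exc:
        raise ImportError(
            "cuda-bindings is required for CUDA version detection; "
            "install pynvbench[cu12] or pynvbench[cu13]"
        ) from exc
    return int(cuda.bindings.__version__.split(".")[0])


def _load_nvbench_extension():
    """Import the compiled extension matching the detected CUDA major version."""
    major = _detect_cuda_major()
    name = f"cuda.bench._nvbench_cu{major}"  # e.g. cuda.bench._nvbench_cu12
    try:
        return importlib.import_module(name)
    except ImportError as exc:
        raise ImportError(
            f"No pynvbench extension found for CUDA {major}.x in this wheel "
            f"(supported: 12, 13)"
        ) from exc

With the merged wheel, `pip install pynvbench[cu12]` or `pip install pynvbench[cu13]` brings in the matching cuda-bindings and CUPTI packages, and this loader selects the corresponding extension at import time.
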
From a0206000f4e540f5ac712e37bf95cd0bb86e8104 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 08:01:48 -0500 Subject: [PATCH 02/20] Update .gitignore --- .gitignore | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.gitignore b/.gitignore index a5c769c6..eec6d226 100644 --- a/.gitignore +++ b/.gitignore @@ -8,13 +8,6 @@ cmake-build-* *~ compile_commands.json CMakeUserPresets.json -<<<<<<< HEAD - -# Python wheel builds -wheelhouse/ -======= -<<<<<<< Updated upstream -======= # Python wheel builds wheelhouse/ @@ -22,14 +15,9 @@ wheelhouse_merged/ wheelhouse_final/ dist/ dist_*/ ->>>>>>> 1ab0a2e (Add multi-cuda wheel build) *.whl *.egg-info/ __pycache__/ *.pyc *.pyo -<<<<<<< HEAD -======= .pytest_cache/ ->>>>>>> Stashed changes ->>>>>>> 1ab0a2e (Add multi-cuda wheel build) From 18ecf118643e1ee84052eb9800e68a9c5cfa9d8e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 08:05:34 -0500 Subject: [PATCH 03/20] No need for the artifacts utilities --- ci/build_multi_cuda_wheel.sh | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/ci/build_multi_cuda_wheel.sh b/ci/build_multi_cuda_wheel.sh index af0ca61e..bd5a953b 100755 --- a/ci/build_multi_cuda_wheel.sh +++ b/ci/build_multi_cuda_wheel.sh @@ -37,24 +37,6 @@ if [[ -z "${HOST_WORKSPACE:-}" ]]; then echo "Setting HOST_WORKSPACE to: $HOST_WORKSPACE" fi -if [[ -n "${GITHUB_ACTIONS:-}" ]]; then - # Prepare mount points etc for getting artifacts in/out of the container. - source "$ci_dir/util/artifacts/common.sh" - # Note that these mounts use the runner (not the devcontainer) filesystem for - # source directories because of docker-out-of-docker quirks. - # The workflow-job GH actions make sure that they exist before running any - # scripts. - action_mounts=$(cat < Date: Thu, 4 Dec 2025 10:03:26 -0500 Subject: [PATCH 04/20] Updates to correctly handle multi-CUDA wheels --- ci/test_pynvbench.sh | 4 ++-- ci/test_pynvbench_inner.sh | 27 +++++++++------------------ python/CMakeLists.txt | 3 ++- python/cuda/bench/__init__.py | 14 ++++++++------ python/src/py_nvbench.cpp | 6 +++++- 5 files changed, 26 insertions(+), 28 deletions(-) diff --git a/ci/test_pynvbench.sh b/ci/test_pynvbench.sh index ec3e4573..53451515 100755 --- a/ci/test_pynvbench.sh +++ b/ci/test_pynvbench.sh @@ -43,10 +43,10 @@ fi # Map cuda_version to full version and set CUDA extra if [[ "$cuda_version" == "12" ]]; then cuda_full_version="12.9.1" - cuda_extra="[cu12]" + cuda_extra="cu12" elif [[ "$cuda_version" == "13" ]]; then cuda_full_version="13.0.1" - cuda_extra="[cu13]" + cuda_extra="cu13" else echo "Error: Unsupported CUDA version: $cuda_version" exit 1 diff --git a/ci/test_pynvbench_inner.sh b/ci/test_pynvbench_inner.sh index 02d8ee0b..cc0e7527 100755 --- a/ci/test_pynvbench_inner.sh +++ b/ci/test_pynvbench_inner.sh @@ -24,34 +24,25 @@ echo "CUDA version: $(nvcc --version | grep release)" # Wheel should be in /workspace/wheelhouse (downloaded by workflow or built locally) WHEELHOUSE_DIR="/workspace/wheelhouse" -<<<<<<< HEAD -# Find and install pynvbench wheel -# Look for .cu${cuda_version} in the version string (e.g., pynvbench-0.0.1.dev1+g123.cu12-...) 
-PYNVBENCH_WHEEL_PATH="$(ls ${WHEELHOUSE_DIR}/pynvbench-*.cu${cuda_version}-*.whl 2>/dev/null | head -1)" +# Find the pynvbench wheel (multi-CUDA wheel) +# Prefer manylinux wheels, fall back to any wheel +PYNVBENCH_WHEEL_PATH="$(ls ${WHEELHOUSE_DIR}/pynvbench-*manylinux*.whl 2>/dev/null | head -1)" if [[ -z "$PYNVBENCH_WHEEL_PATH" ]]; then - echo "Error: No pynvbench wheel found in ${WHEELHOUSE_DIR}" - echo "Looking for: pynvbench-*.cu${cuda_version}-*.whl" -======= -# Find the pynvbench wheel (multi-CUDA wheel, no CUDA version in filename) -PYNVBENCH_WHEEL_PATH="$(ls ${WHEELHOUSE_DIR}/pynvbench-*.whl 2>/dev/null | head -1)" + PYNVBENCH_WHEEL_PATH="$(ls ${WHEELHOUSE_DIR}/pynvbench-*.whl 2>/dev/null | head -1)" +fi + if [[ -z "$PYNVBENCH_WHEEL_PATH" ]]; then echo "Error: No pynvbench wheel found in ${WHEELHOUSE_DIR}" ->>>>>>> 1ab0a2e (Add multi-cuda wheel build) echo "Contents of ${WHEELHOUSE_DIR}:" ls -la ${WHEELHOUSE_DIR}/ || true exit 1 fi -<<<<<<< HEAD -echo "Installing wheel: $PYNVBENCH_WHEEL_PATH" -python -m pip install "${PYNVBENCH_WHEEL_PATH}[test]" -======= # Determine which CUDA extra to install (defaults to cu12 if not specified) -CUDA_EXTRA="${cuda_extra:-[cu${cuda_version}]}" +CUDA_EXTRA="${cuda_extra:-cu${cuda_version}}" -echo "Installing wheel: $PYNVBENCH_WHEEL_PATH with extras: ${CUDA_EXTRA}" -python -m pip install "${PYNVBENCH_WHEEL_PATH}${CUDA_EXTRA},test" ->>>>>>> 1ab0a2e (Add multi-cuda wheel build) +echo "Installing wheel: $PYNVBENCH_WHEEL_PATH with extras: ${CUDA_EXTRA},test" +python -m pip install "${PYNVBENCH_WHEEL_PATH}[${CUDA_EXTRA},test]" # Run tests cd "/workspace/python/test/" diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index c0dda5a0..20cd71cc 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -39,8 +39,9 @@ set(NVBENCH_EXTENSION_NAME "_nvbench${NVBENCH_EXTENSION_SUFFIX}") message(STATUS "Building extension: ${NVBENCH_EXTENSION_NAME}") pybind11_add_module(${NVBENCH_EXTENSION_NAME} MODULE src/py_nvbench.cpp) +target_compile_definitions(${NVBENCH_EXTENSION_NAME} PRIVATE PYBIND11_MODULE_NAME=${NVBENCH_EXTENSION_NAME}) target_link_libraries(${NVBENCH_EXTENSION_NAME} PUBLIC nvbench::nvbench) -target_link_libraries(${NVBENCH_EXTENSION_NAME} PRIVATE CUDA::cudart_static) +target_link_libraries(${NVBENCH_EXTENSION_NAME} PRIVATE CUDA::cudart_static CUDA::cuda_driver) set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INSTALL_RPATH "$ORIGIN") set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py index d2439fe3..9c8a35ce 100644 --- a/python/cuda/bench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -37,13 +37,12 @@ def _get_cuda_major_version(): """Detect the CUDA runtime major version.""" try: - from cuda import cuda as cuda_bindings + import cuda.bindings - err, version = cuda_bindings.cuRuntimeGetVersion() - if err != cuda_bindings.cudaError_t.cudaSuccess: - raise RuntimeError(f"Failed to get CUDA runtime version: {err}") - # Version is encoded as (major * 1000) + (minor * 10) - major = version // 1000 + # Get CUDA version from cuda-bindings package version + # cuda-bindings version is in format like "12.9.1" or "13.0.0" + version_str = cuda.bindings.__version__ + major = int(version_str.split(".")[0]) return major except ImportError: raise ImportError( @@ -78,6 +77,9 @@ def _get_cuda_major_version(): register = _nvbench_module.register run_all_benchmarks = _nvbench_module.run_all_benchmarks +# Expose the module as 
_nvbench for backward compatibility (e.g., for tests) +_nvbench = _nvbench_module + # Clean up internal symbols del ( load_nvidia_dynamic_lib, diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 8856e8e7..a878f5ec 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -230,7 +230,11 @@ std::unique_ptr global_registry{}; // If you modify these bindings, please be sure to update the // corresponding type hints in ``../cuda/nvbench/__init__.pyi`` -PYBIND11_MODULE(_nvbench, m) +#ifndef PYBIND11_MODULE_NAME +#define PYBIND11_MODULE_NAME _nvbench +#endif + +PYBIND11_MODULE(PYBIND11_MODULE_NAME, m) { // == STEP 1 // Set environment variable CUDA_MODULE_LOADING=EAGER From f03d18ad9ac1c21d1bceb64d44ada7474f164785 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 10:30:32 -0500 Subject: [PATCH 05/20] Small fix --- .github/workflows/build-and-test-python-wheels.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-and-test-python-wheels.yml b/.github/workflows/build-and-test-python-wheels.yml index 377e0d6a..c58df68c 100644 --- a/.github/workflows/build-and-test-python-wheels.yml +++ b/.github/workflows/build-and-test-python-wheels.yml @@ -76,10 +76,10 @@ jobs: # Use the same rapidsai/ci-wheel Docker image as build if [[ "${{ matrix.cuda }}" == "12" ]]; then cuda_full_version="12.9.1" - cuda_extra="[cu12]" + cuda_extra="cu12" else cuda_full_version="13.0.1" - cuda_extra="[cu13]" + cuda_extra="cu13" fi docker run --rm \ From edd31c764d32c3608c0a1bc1339fd335181017b8 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 10:34:20 -0500 Subject: [PATCH 06/20] Reduce CI matrix --- .../build-and-test-python-wheels.yml | 6 +- .github/workflows/build-wheels.yml | 2 +- .github/workflows/pr.yml | 102 +++++++++--------- 3 files changed, 55 insertions(+), 55 deletions(-) diff --git a/.github/workflows/build-and-test-python-wheels.yml b/.github/workflows/build-and-test-python-wheels.yml index c58df68c..f856c650 100644 --- a/.github/workflows/build-and-test-python-wheels.yml +++ b/.github/workflows/build-and-test-python-wheels.yml @@ -19,7 +19,7 @@ jobs: strategy: fail-fast: false matrix: - python: ['3.10', '3.11', '3.12', '3.13'] + python: ['3.13'] steps: - name: Checkout repository @@ -55,8 +55,8 @@ jobs: strategy: fail-fast: false matrix: - cuda: ['12', '13'] - python: ['3.10', '3.11', '3.12', '3.13'] + cuda: ['13'] + python: ['3.13'] steps: - name: Checkout repository diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 4ae97491..993e60fa 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -18,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - python: ['3.10', '3.11', '3.12', '3.13'] + python: ['3.13'] steps: - name: Checkout repository diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index c42cda01..3653f510 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -35,46 +35,46 @@ permissions: pull-requests: read jobs: - compute-matrix: - name: Compute matrix - runs-on: ubuntu-latest - outputs: - DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} - PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} - PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} - base_sha: ${{ steps.export-pr-info.outputs.base_sha }} - pr_number: ${{ steps.export-pr-info.outputs.pr_number }} - steps: - - name: Checkout repo - uses: 
actions/checkout@v4 - - name: Lookup PR info - id: get-pr-info - uses: nv-gha-runners/get-pr-info@main - - name: Export PR info - id: export-pr-info - run: | - echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" - echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" - - name: Compute matrix outputs - id: set-outputs - run: | - .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request + # compute-matrix: + # name: Compute matrix + # runs-on: ubuntu-latest + # outputs: + # DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} + # PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} + # PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} + # base_sha: ${{ steps.export-pr-info.outputs.base_sha }} + # pr_number: ${{ steps.export-pr-info.outputs.pr_number }} + # steps: + # - name: Checkout repo + # uses: actions/checkout@v4 + # - name: Lookup PR info + # id: get-pr-info + # uses: nv-gha-runners/get-pr-info@main + # - name: Export PR info + # id: export-pr-info + # run: | + # echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" + # echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" + # - name: Compute matrix outputs + # id: set-outputs + # run: | + # .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request - nvbench: - name: NVBench CUDA${{ matrix.cuda_host_combination }} - permissions: - id-token: write - contents: read - needs: compute-matrix - uses: ./.github/workflows/dispatch-build-and-test.yml - strategy: - fail-fast: false - matrix: - cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} - with: - project_name: "nvbench" - per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} - devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} + # nvbench: + # name: NVBench CUDA${{ matrix.cuda_host_combination }} + # permissions: + # id-token: write + # contents: read + # needs: compute-matrix + # uses: ./.github/workflows/dispatch-build-and-test.yml + # strategy: + # fail-fast: false + # matrix: + # cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} + # with: + # project_name: "nvbench" + # per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} + # devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} python-wheels: name: Python Wheels @@ -83,16 +83,16 @@ jobs: contents: read uses: ./.github/workflows/build-and-test-python-wheels.yml - verify-devcontainers: - name: Verify Dev Containers - if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} - needs: compute-matrix - permissions: - id-token: write - contents: read - uses: ./.github/workflows/verify-devcontainers.yml - with: - base_sha: ${{ needs.compute-matrix.outputs.base_sha }} + # verify-devcontainers: + # name: Verify Dev Containers + # if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} + # needs: compute-matrix + # permissions: + # id-token: write + # contents: read + # uses: ./.github/workflows/verify-devcontainers.yml + # with: + # base_sha: ${{ needs.compute-matrix.outputs.base_sha }} # This job is the final 
job that runs after all other jobs and is used for branch protection status checks. # See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks @@ -102,9 +102,9 @@ jobs: name: CI if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success needs: - - nvbench + # - nvbench - python-wheels - - verify-devcontainers + # - verify-devcontainers steps: - name: Check status of all precursor jobs if: >- From d178a089324e0aac866ecdf7a579bb8626f4d290 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 10:54:35 -0500 Subject: [PATCH 07/20] =?UTF-8?q?Special=20handling=20for=20nvidia-cuda-cu?= =?UTF-8?q?pti=20for=20cuda=2013=20=F0=9F=A4=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 9d2baebb..f238fa6b 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -33,7 +33,7 @@ readme = { file = "README.md", content-type = "text/markdown" } cu12 = ["cuda-bindings>=12.0.0,<13.0.0", "nvidia-cuda-cupti-cu12"] # CUDA 13.x dependencies -cu13 = ["cuda-bindings>=13.0.0,<14.0.0", "nvidia-cuda-cupti-cu13"] +cu13 = ["cuda-bindings>=13.0.0,<14.0.0", "nvidia-cuda-cupti>=13.0"] # Test dependencies (using CUDA 12 by default) test = ["pytest", "cupy-cuda12x", "numba"] From fd2de8c9b0ddf5a6babfc79ca73b1ffcfd47a60d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 12:36:32 -0500 Subject: [PATCH 08/20] Fix issue causing segfault --- ci/build_multi_cuda_wheel.sh | 12 +++++++++++- ci/test_pynvbench_inner.sh | 5 +++-- python/pyproject.toml | 8 +++++++- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/ci/build_multi_cuda_wheel.sh b/ci/build_multi_cuda_wheel.sh index bd5a953b..fe7d4c9e 100755 --- a/ci/build_multi_cuda_wheel.sh +++ b/ci/build_multi_cuda_wheel.sh @@ -123,14 +123,24 @@ cd wheelhouse_merged $PYTHON -m wheel unpack "$cu12_wheel" base_dir=$(find . -maxdepth 1 -type d -name "pynvbench-*" | head -1) +# Rename libnvbench.so to libnvbench_cu12.so in the base (CUDA 12) wheel +mv "$base_dir/cuda/bench/libnvbench.so" "$base_dir/cuda/bench/libnvbench_cu12.so" + +# Update _nvbench_cu12.so to link to libnvbench_cu12.so instead of libnvbench.so +patchelf --replace-needed libnvbench.so libnvbench_cu12.so "$base_dir"/cuda/bench/_nvbench_cu12*.so + # Unpack CUDA 13 wheel into a temporary subdirectory mkdir cu13_tmp cd cu13_tmp $PYTHON -m wheel unpack "$cu13_wheel" cu13_dir=$(find . -maxdepth 1 -type d -name "pynvbench-*" | head -1) -# Copy the CUDA 13 extension into the base wheel +# Copy the CUDA 13 extension AND library into the base wheel cp "$cu13_dir"/cuda/bench/_nvbench_cu13*.so "../$base_dir/cuda/bench/" +cp "$cu13_dir"/cuda/bench/libnvbench.so "../$base_dir/cuda/bench/libnvbench_cu13.so" + +# Update _nvbench_cu13.so to link to libnvbench_cu13.so instead of libnvbench.so +patchelf --replace-needed libnvbench.so libnvbench_cu13.so "../$base_dir"/cuda/bench/_nvbench_cu13*.so # Go back and clean up cd .. 
diff --git a/ci/test_pynvbench_inner.sh b/ci/test_pynvbench_inner.sh index cc0e7527..4a7bd1c1 100755 --- a/ci/test_pynvbench_inner.sh +++ b/ci/test_pynvbench_inner.sh @@ -40,9 +40,10 @@ fi # Determine which CUDA extra to install (defaults to cu12 if not specified) CUDA_EXTRA="${cuda_extra:-cu${cuda_version}}" +TEST_EXTRA="test-cu${cuda_version}" -echo "Installing wheel: $PYNVBENCH_WHEEL_PATH with extras: ${CUDA_EXTRA},test" -python -m pip install "${PYNVBENCH_WHEEL_PATH}[${CUDA_EXTRA},test]" +echo "Installing wheel: $PYNVBENCH_WHEEL_PATH with extras: ${CUDA_EXTRA},${TEST_EXTRA}" +python -m pip install "${PYNVBENCH_WHEEL_PATH}[${CUDA_EXTRA},${TEST_EXTRA}]" # Run tests cd "/workspace/python/test/" diff --git a/python/pyproject.toml b/python/pyproject.toml index f238fa6b..08161210 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -35,7 +35,13 @@ cu12 = ["cuda-bindings>=12.0.0,<13.0.0", "nvidia-cuda-cupti-cu12"] # CUDA 13.x dependencies cu13 = ["cuda-bindings>=13.0.0,<14.0.0", "nvidia-cuda-cupti>=13.0"] -# Test dependencies (using CUDA 12 by default) +# Test dependencies for CUDA 12 +test-cu12 = ["pynvbench[cu12]", "pytest", "cupy-cuda12x", "numba"] + +# Test dependencies for CUDA 13 +test-cu13 = ["pynvbench[cu13]", "pytest", "cupy-cuda13x", "numba"] + +# Generic test dependencies (defaults to CUDA 12) test = ["pytest", "cupy-cuda12x", "numba"] [project.urls] From 1e2ef0fe04fb89be49363c9313cfeaf2aa2c23bc Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 13:17:13 -0500 Subject: [PATCH 09/20] Revert "Reduce CI matrix" This reverts commit edd31c764d32c3608c0a1bc1339fd335181017b8. --- .../build-and-test-python-wheels.yml | 6 +- .github/workflows/build-wheels.yml | 2 +- .github/workflows/pr.yml | 102 +++++++++--------- 3 files changed, 55 insertions(+), 55 deletions(-) diff --git a/.github/workflows/build-and-test-python-wheels.yml b/.github/workflows/build-and-test-python-wheels.yml index f856c650..c58df68c 100644 --- a/.github/workflows/build-and-test-python-wheels.yml +++ b/.github/workflows/build-and-test-python-wheels.yml @@ -19,7 +19,7 @@ jobs: strategy: fail-fast: false matrix: - python: ['3.13'] + python: ['3.10', '3.11', '3.12', '3.13'] steps: - name: Checkout repository @@ -55,8 +55,8 @@ jobs: strategy: fail-fast: false matrix: - cuda: ['13'] - python: ['3.13'] + cuda: ['12', '13'] + python: ['3.10', '3.11', '3.12', '3.13'] steps: - name: Checkout repository diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 993e60fa..4ae97491 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -18,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - python: ['3.13'] + python: ['3.10', '3.11', '3.12', '3.13'] steps: - name: Checkout repository diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 3653f510..c42cda01 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -35,46 +35,46 @@ permissions: pull-requests: read jobs: - # compute-matrix: - # name: Compute matrix - # runs-on: ubuntu-latest - # outputs: - # DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} - # PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} - # PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} - # base_sha: ${{ steps.export-pr-info.outputs.base_sha }} - # pr_number: ${{ steps.export-pr-info.outputs.pr_number }} - # steps: - # - name: Checkout repo - # uses: actions/checkout@v4 - # - name: Lookup PR info 
- # id: get-pr-info - # uses: nv-gha-runners/get-pr-info@main - # - name: Export PR info - # id: export-pr-info - # run: | - # echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" - # echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" - # - name: Compute matrix outputs - # id: set-outputs - # run: | - # .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request + compute-matrix: + name: Compute matrix + runs-on: ubuntu-latest + outputs: + DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} + PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} + PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} + base_sha: ${{ steps.export-pr-info.outputs.base_sha }} + pr_number: ${{ steps.export-pr-info.outputs.pr_number }} + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Lookup PR info + id: get-pr-info + uses: nv-gha-runners/get-pr-info@main + - name: Export PR info + id: export-pr-info + run: | + echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" + echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" + - name: Compute matrix outputs + id: set-outputs + run: | + .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request - # nvbench: - # name: NVBench CUDA${{ matrix.cuda_host_combination }} - # permissions: - # id-token: write - # contents: read - # needs: compute-matrix - # uses: ./.github/workflows/dispatch-build-and-test.yml - # strategy: - # fail-fast: false - # matrix: - # cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} - # with: - # project_name: "nvbench" - # per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} - # devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} + nvbench: + name: NVBench CUDA${{ matrix.cuda_host_combination }} + permissions: + id-token: write + contents: read + needs: compute-matrix + uses: ./.github/workflows/dispatch-build-and-test.yml + strategy: + fail-fast: false + matrix: + cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} + with: + project_name: "nvbench" + per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} + devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} python-wheels: name: Python Wheels @@ -83,16 +83,16 @@ jobs: contents: read uses: ./.github/workflows/build-and-test-python-wheels.yml - # verify-devcontainers: - # name: Verify Dev Containers - # if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} - # needs: compute-matrix - # permissions: - # id-token: write - # contents: read - # uses: ./.github/workflows/verify-devcontainers.yml - # with: - # base_sha: ${{ needs.compute-matrix.outputs.base_sha }} + verify-devcontainers: + name: Verify Dev Containers + if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} + needs: compute-matrix + permissions: + id-token: write + contents: read + uses: ./.github/workflows/verify-devcontainers.yml + with: + base_sha: ${{ needs.compute-matrix.outputs.base_sha }} # This job is the final job that runs after all other jobs and is used for branch protection 
status checks. # See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks @@ -102,9 +102,9 @@ jobs: name: CI if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success needs: - # - nvbench + - nvbench - python-wheels - # - verify-devcontainers + - verify-devcontainers steps: - name: Check status of all precursor jobs if: >- From d54f264f23f350f805d3eed5eefce96e4ec7be6c Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 16:08:27 -0500 Subject: [PATCH 10/20] Try to simplify the approach --- ci/build_multi_cuda_wheel.sh | 31 ++++++++++++++-------------- ci/build_pynvbench_wheel_for_cuda.sh | 15 +------------- python/CMakeLists.txt | 18 ++++++---------- python/cuda/bench/__init__.py | 10 ++++++--- 4 files changed, 29 insertions(+), 45 deletions(-) diff --git a/ci/build_multi_cuda_wheel.sh b/ci/build_multi_cuda_wheel.sh index fe7d4c9e..a91c77fb 100755 --- a/ci/build_multi_cuda_wheel.sh +++ b/ci/build_multi_cuda_wheel.sh @@ -92,9 +92,9 @@ fi # Needed for unpacking and repacking wheels. $PYTHON -m pip install --break-system-packages wheel -# Find the built wheels -cu12_wheel=$(find wheelhouse -name "*cu12*.whl" | head -1) -cu13_wheel=$(find wheelhouse -name "*cu13*.whl" | head -1) +# Find the built wheels (they no longer have cu12/cu13 suffix in the name) +cu12_wheel=$(find wheelhouse -name "pynvbench-*.whl" | head -1) +cu13_wheel=$(find wheelhouse -name "pynvbench-*.whl" | tail -1) if [[ -z "$cu12_wheel" ]]; then echo "Error: CUDA 12 wheel not found in wheelhouse/" @@ -108,6 +108,12 @@ if [[ -z "$cu13_wheel" ]]; then exit 1 fi +if [[ "$cu12_wheel" == "$cu13_wheel" ]]; then + echo "Error: Only one wheel found, expected two (CUDA 12 and CUDA 13)" + ls -la wheelhouse/ + exit 1 +fi + echo "Found CUDA 12 wheel: $cu12_wheel" echo "Found CUDA 13 wheel: $cu13_wheel" @@ -123,36 +129,29 @@ cd wheelhouse_merged $PYTHON -m wheel unpack "$cu12_wheel" base_dir=$(find . -maxdepth 1 -type d -name "pynvbench-*" | head -1) -# Rename libnvbench.so to libnvbench_cu12.so in the base (CUDA 12) wheel -mv "$base_dir/cuda/bench/libnvbench.so" "$base_dir/cuda/bench/libnvbench_cu12.so" - -# Update _nvbench_cu12.so to link to libnvbench_cu12.so instead of libnvbench.so -patchelf --replace-needed libnvbench.so libnvbench_cu12.so "$base_dir"/cuda/bench/_nvbench_cu12*.so - # Unpack CUDA 13 wheel into a temporary subdirectory mkdir cu13_tmp cd cu13_tmp $PYTHON -m wheel unpack "$cu13_wheel" cu13_dir=$(find . -maxdepth 1 -type d -name "pynvbench-*" | head -1) -# Copy the CUDA 13 extension AND library into the base wheel -cp "$cu13_dir"/cuda/bench/_nvbench_cu13*.so "../$base_dir/cuda/bench/" -cp "$cu13_dir"/cuda/bench/libnvbench.so "../$base_dir/cuda/bench/libnvbench_cu13.so" - -# Update _nvbench_cu13.so to link to libnvbench_cu13.so instead of libnvbench.so -patchelf --replace-needed libnvbench.so libnvbench_cu13.so "../$base_dir"/cuda/bench/_nvbench_cu13*.so +# Copy the cu13/ directory from CUDA 13 wheel into the base wheel +cp -r "$cu13_dir"/cuda/bench/cu13 "../$base_dir/cuda/bench/" # Go back and clean up cd .. rm -rf cu13_tmp +# Remove RECORD file to let wheel recreate it +rm -f "$base_dir"/*.dist-info/RECORD + # Repack the merged wheel $PYTHON -m wheel pack "$base_dir" cd .. 
# Install auditwheel and repair the merged wheel -$PYTHON -m pip install --break-system-packages patchelf auditwheel +$PYTHON -m pip install --break-system-packages auditwheel for wheel in wheelhouse_merged/pynvbench-*.whl; do echo "Repairing merged wheel: $wheel" $PYTHON -m auditwheel repair \ diff --git a/ci/build_pynvbench_wheel_for_cuda.sh b/ci/build_pynvbench_wheel_for_cuda.sh index 0e3d4b7d..79ebbd32 100755 --- a/ci/build_pynvbench_wheel_for_cuda.sh +++ b/ci/build_pynvbench_wheel_for_cuda.sh @@ -56,22 +56,9 @@ export CXX="$(which g++)" export CUDACXX="$(which nvcc)" export CUDAHOSTCXX="$(which g++)" -# Set CUDA suffix for extension naming -export PYNVBENCH_CUDA_SUFFIX="_cu${cuda_version}" - # Build the wheel python -m pip wheel --no-deps --verbose --wheel-dir dist . -# Rename wheel to include CUDA version suffix -for wheel in dist/pynvbench-*.whl; do - if [[ -f "$wheel" ]]; then - base_name=$(basename "$wheel" .whl) - new_name="${base_name}.cu${cuda_version}.whl" - mv "$wheel" "dist/${new_name}" - echo "Renamed wheel to: ${new_name}" - fi -done - # Move wheel to output directory mkdir -p /workspace/wheelhouse -mv dist/pynvbench-*.cu*.whl /workspace/wheelhouse/ +mv dist/pynvbench-*.whl /workspace/wheelhouse/ diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 20cd71cc..e7c0c776 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -25,18 +25,12 @@ CPMAddPackage( CPMAddPackage("gh:pybind/pybind11@3.0.0") -# Determine CUDA major version for extension naming +# Determine CUDA major version for extension naming and directory structure string(REGEX MATCH "^([0-9]+)" CUDA_VERSION_MAJOR "${CUDAToolkit_VERSION}") +set(CUDA_VERSION_DIR "cu${CUDA_VERSION_MAJOR}") -# Allow override of extension suffix via environment variable (for multi-CUDA builds) -if(DEFINED ENV{PYNVBENCH_CUDA_SUFFIX}) - set(NVBENCH_EXTENSION_SUFFIX $ENV{PYNVBENCH_CUDA_SUFFIX}) -else() - set(NVBENCH_EXTENSION_SUFFIX "_cu${CUDA_VERSION_MAJOR}") -endif() - -set(NVBENCH_EXTENSION_NAME "_nvbench${NVBENCH_EXTENSION_SUFFIX}") -message(STATUS "Building extension: ${NVBENCH_EXTENSION_NAME}") +set(NVBENCH_EXTENSION_NAME "_nvbench_cu${CUDA_VERSION_MAJOR}") +message(STATUS "Building extension: ${NVBENCH_EXTENSION_NAME} for CUDA ${CUDA_VERSION_MAJOR}, output directory: ${CUDA_VERSION_DIR}") pybind11_add_module(${NVBENCH_EXTENSION_NAME} MODULE src/py_nvbench.cpp) target_compile_definitions(${NVBENCH_EXTENSION_NAME} PRIVATE PYBIND11_MODULE_NAME=${NVBENCH_EXTENSION_NAME}) @@ -47,9 +41,9 @@ set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INSTALL_RPATH "$ORIGI set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS ${NVBENCH_EXTENSION_NAME} DESTINATION cuda/bench) +install(TARGETS ${NVBENCH_EXTENSION_NAME} DESTINATION cuda/bench/${CUDA_VERSION_DIR}) # Determine target that nvbench::nvbench is an alias of, # necessary because ALIAS targets cannot be installed get_target_property(_aliased_target_name nvbench::nvbench ALIASED_TARGET) -install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/bench) +install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/bench/${CUDA_VERSION_DIR}) diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py index 9c8a35ce..0e8ce77d 100644 --- a/python/cuda/bench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -52,10 +52,11 @@ def _get_cuda_major_version(): _cuda_major = 
_get_cuda_major_version() -_extension_name = f"_nvbench_cu{_cuda_major}" +_extra_name = f"cu{_cuda_major}" +_module_fullname = f"cuda.bench.{_extra_name}._nvbench_cu{_cuda_major}" try: - _nvbench_module = importlib.import_module(f"cuda.bench.{_extension_name}") + _nvbench_module = importlib.import_module(_module_fullname) except ImportError as e: raise ImportError( f"No pynvbench extension found for CUDA {_cuda_major}.x. " @@ -76,6 +77,8 @@ def _get_cuda_major_version(): State = _nvbench_module.State register = _nvbench_module.register run_all_benchmarks = _nvbench_module.run_all_benchmarks +test_cpp_exception = _nvbench_module.test_cpp_exception +test_py_exception = _nvbench_module.test_py_exception # Expose the module as _nvbench for backward compatibility (e.g., for tests) _nvbench = _nvbench_module @@ -85,6 +88,7 @@ def _get_cuda_major_version(): load_nvidia_dynamic_lib, _nvbench_module, _cuda_major, - _extension_name, + _extra_name, + _module_fullname, _get_cuda_major_version, ) From b63f44aba69861af8834797dc885029fb71a9327 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 16:40:44 -0500 Subject: [PATCH 11/20] More simplifications --- .../build-and-test-python-wheels.yml | 1 - .github/workflows/build-wheels.yml | 57 ------------ ci/build_multi_cuda_wheel.sh | 12 ++- ci/build_pynvbench_wheel.sh | 90 ------------------- ci/build_pynvbench_wheel_for_cuda.sh | 27 +++--- ci/build_pynvbench_wheel_inner.sh | 89 ------------------ python/CMakeLists.txt | 7 +- python/cuda/bench/__init__.py | 2 +- 8 files changed, 31 insertions(+), 254 deletions(-) delete mode 100644 .github/workflows/build-wheels.yml delete mode 100755 ci/build_pynvbench_wheel.sh delete mode 100755 ci/build_pynvbench_wheel_inner.sh diff --git a/.github/workflows/build-and-test-python-wheels.yml b/.github/workflows/build-and-test-python-wheels.yml index c58df68c..46f391bc 100644 --- a/.github/workflows/build-and-test-python-wheels.yml +++ b/.github/workflows/build-and-test-python-wheels.yml @@ -9,7 +9,6 @@ defaults: shell: bash --noprofile --norc -euo pipefail {0} jobs: - # Build multi-CUDA wheels (one wheel per Python version, includes both CUDA 12 & 13) build-wheels: name: Build wheel (Python ${{ matrix.python }}) runs-on: ubuntu-latest diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml deleted file mode 100644 index 4ae97491..00000000 --- a/.github/workflows/build-wheels.yml +++ /dev/null @@ -1,57 +0,0 @@ -name: Build Python Wheels (Manual) - -on: - workflow_dispatch: - workflow_call: - -defaults: - run: - shell: bash --noprofile --norc -euo pipefail {0} - -jobs: - build-wheels: - name: Build wheel (Python ${{ matrix.python }}) - runs-on: ubuntu-latest - permissions: - id-token: write - contents: read - strategy: - fail-fast: false - matrix: - python: ['3.10', '3.11', '3.12', '3.13'] - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - persist-credentials: false - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Build multi-CUDA wheel - run: | - bash ci/build_multi_cuda_wheel.sh -py-version ${{ matrix.python }} - - - name: Upload wheel artifact - uses: actions/upload-artifact@v4 - with: - name: wheel-pynvbench-py${{ matrix.python }} - path: wheelhouse/*.whl - retention-days: 7 - if-no-files-found: error - - verify-wheels: - name: Verify all wheels built successfully - if: ${{ always() }} - needs: build-wheels - runs-on: ubuntu-latest - steps: - - name: Check build results - run: | - if [[ "${{ 
needs.build-wheels.result }}" != "success" ]]; then - echo "Wheel builds failed!" - exit 1 - fi - echo "All wheels built successfully!" diff --git a/ci/build_multi_cuda_wheel.sh b/ci/build_multi_cuda_wheel.sh index a91c77fb..a25cf39f 100755 --- a/ci/build_multi_cuda_wheel.sh +++ b/ci/build_multi_cuda_wheel.sh @@ -16,6 +16,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Build a multi-CUDA wheel for the given Python version +# This builds separate wheels for each supported CUDA major version, +# and then merges them into a single wheel containing extensions +# for all CUDA versions. At runtime, depending on the installed CUDA version, +# the correct extension will be chosen. + set -euo pipefail ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -92,9 +98,9 @@ fi # Needed for unpacking and repacking wheels. $PYTHON -m pip install --break-system-packages wheel -# Find the built wheels (they no longer have cu12/cu13 suffix in the name) -cu12_wheel=$(find wheelhouse -name "pynvbench-*.whl" | head -1) -cu13_wheel=$(find wheelhouse -name "pynvbench-*.whl" | tail -1) +# Find the built wheels (temporarily suffixed with .cu12/.cu13 to avoid collision) +cu12_wheel=$(find wheelhouse -name "*cu12*.whl" | head -1) +cu13_wheel=$(find wheelhouse -name "*cu13*.whl" | head -1) if [[ -z "$cu12_wheel" ]]; then echo "Error: CUDA 12 wheel not found in wheelhouse/" diff --git a/ci/build_pynvbench_wheel.sh b/ci/build_pynvbench_wheel.sh deleted file mode 100755 index 306891be..00000000 --- a/ci/build_pynvbench_wheel.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash -set -euo pipefail - -ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -usage="Usage: $0 -py-version -cuda-version " - -source "$ci_dir/util/python/common_arg_parser.sh" - -# Parse arguments including CUDA version -parse_python_args "$@" - -# Now parse CUDA version from remaining arguments -cuda_version="" -while [[ $# -gt 0 ]]; do - case $1 in - -cuda-version=*) - cuda_version="${1#*=}" - shift - ;; - -cuda-version) - if [[ $# -lt 2 ]]; then - echo "Error: -cuda-version requires a value" >&2 - exit 1 - fi - cuda_version="$2" - shift 2 - ;; - *) - shift - ;; - esac -done - -# Check if py_version was provided (this script requires it) -require_py_version "$usage" || exit 1 - -if [[ -z "$cuda_version" ]]; then - echo "Error: -cuda-version is required" - echo "$usage" - exit 1 -fi - -echo "Docker socket: " $(ls /var/run/docker.sock) - -# Map cuda_version to full version -if [[ "$cuda_version" == "12" ]]; then - cuda_full_version="12.9.1" - cuda_short="cu12" -elif [[ "$cuda_version" == "13" ]]; then - cuda_full_version="13.0.1" - cuda_short="cu13" -else - echo "Error: Unsupported CUDA version: $cuda_version" - exit 1 -fi - -# pynvbench must be built in a container that can produce manylinux wheels, -# and has the CUDA toolkit installed. We use the rapidsai/ci-wheel image for this. 
-readonly devcontainer_version=25.12 -readonly devcontainer_distro=rockylinux8 - -if [[ "$(uname -m)" == "aarch64" ]]; then - readonly cuda_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda_full_version}-${devcontainer_distro}-py${py_version}-arm64 -else - readonly cuda_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda_full_version}-${devcontainer_distro}-py${py_version} -fi - -mkdir -p wheelhouse - -echo "::group::⚒️ Building CUDA ${cuda_version} wheel on ${cuda_image}" -( - set -x - docker pull $cuda_image - docker run --rm -i \ - --workdir /workspace/python \ - --mount type=bind,source=$(pwd),target=/workspace/ \ - --env py_version=${py_version} \ - --env cuda_version=${cuda_version} \ - $cuda_image \ - /workspace/ci/build_pynvbench_wheel_inner.sh - # Prevent GHA runners from exhausting available storage with leftover images: - if [[ -n "${GITHUB_ACTIONS:-}" ]]; then - docker rmi -f $cuda_image - fi -) -echo "::endgroup::" - -echo "Wheels in wheelhouse:" -ls -la wheelhouse/ diff --git a/ci/build_pynvbench_wheel_for_cuda.sh b/ci/build_pynvbench_wheel_for_cuda.sh index 79ebbd32..c0289ca2 100755 --- a/ci/build_pynvbench_wheel_for_cuda.sh +++ b/ci/build_pynvbench_wheel_for_cuda.sh @@ -22,16 +22,16 @@ set -euxo pipefail # This script builds a single wheel for the container's CUDA version # The /workspace pathnames are hard-wired here. +# Determine CUDA version from nvcc early (needed for dev package installation) +cuda_version=$(nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+' | cut -d. -f1) +echo "Detected CUDA version: ${cuda_version}" + # Install GCC 13 toolset (needed for the build) /workspace/ci/util/retry.sh 5 30 dnf -y install gcc-toolset-13-gcc gcc-toolset-13-gcc-c++ echo -e "#!/bin/bash\nsource /opt/rh/gcc-toolset-13/enable" >/etc/profile.d/enable_devtools.sh source /etc/profile.d/enable_devtools.sh -# Check what's available -which gcc -gcc --version -which nvcc -nvcc --version +# Note: CUDA dev packages (NVML, CUPTI, CUDART) are already installed in rapidsai/ci-wheel containers # Set up Python environment source /workspace/ci/pyenv_helper.sh @@ -47,10 +47,6 @@ fi cd /workspace/python -# Determine CUDA version from nvcc -cuda_version=$(nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+' | cut -d. -f1) -echo "Detected CUDA version: ${cuda_version}" - # Configure compilers export CXX="$(which g++)" export CUDACXX="$(which nvcc)" @@ -59,6 +55,17 @@ export CUDAHOSTCXX="$(which g++)" # Build the wheel python -m pip wheel --no-deps --verbose --wheel-dir dist . +# Temporarily rename wheel to include CUDA version to avoid collision during multi-CUDA build +# The merge script will combine these into a single wheel +for wheel in dist/pynvbench-*.whl; do + if [[ -f "$wheel" ]]; then + base_name=$(basename "$wheel" .whl) + new_name="${base_name}.cu${cuda_version}.whl" + mv "$wheel" "dist/${new_name}" + echo "Renamed wheel to: ${new_name}" + fi +done + # Move wheel to output directory mkdir -p /workspace/wheelhouse -mv dist/pynvbench-*.whl /workspace/wheelhouse/ +mv dist/pynvbench-*.cu*.whl /workspace/wheelhouse/ diff --git a/ci/build_pynvbench_wheel_inner.sh b/ci/build_pynvbench_wheel_inner.sh deleted file mode 100755 index 543deb6a..00000000 --- a/ci/build_pynvbench_wheel_inner.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# Target script for `docker run` command in build_pynvbench_wheel.sh -# The /workspace pathnames are hard-wired here. 
- -# Install GCC 13 toolset (needed for the build) -/workspace/ci/util/retry.sh 5 30 dnf -y install gcc-toolset-13-gcc gcc-toolset-13-gcc-c++ -echo -e "#!/bin/bash\nsource /opt/rh/gcc-toolset-13/enable" >/etc/profile.d/enable_devtools.sh -source /etc/profile.d/enable_devtools.sh - -# Check what's available -which gcc -gcc --version -which nvcc -nvcc --version - -# Set up Python environment -source /workspace/ci/pyenv_helper.sh -setup_python_env "${py_version}" -which python -python --version -echo "Done setting up python env" - -# Ensure we have full git history for setuptools_scm -if $(git rev-parse --is-shallow-repository); then - git fetch --unshallow -fi - -cd /workspace/python - -# Determine CUDA version from nvcc -cuda_major=$(nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+' | cut -d. -f1) -echo "Detected CUDA major version: ${cuda_major}" - -# Configure compilers: -export CXX="$(which g++)" -export CUDACXX="$(which nvcc)" -export CUDAHOSTCXX="$(which g++)" - -# Build the wheel -python -m pip wheel --no-deps --verbose --wheel-dir dist . - -# Install auditwheel for manylinux compliance -python -m pip install auditwheel - -# Repair wheel to make it manylinux compliant -mkdir -p dist_repaired -for wheel in dist/pynvbench-*.whl; do - if [[ -f "$wheel" ]]; then - echo "Repairing wheel: $wheel" - python -m auditwheel repair \ - --exclude 'libcuda.so.1' \ - --exclude 'libnvidia-ml.so.1' \ - "$wheel" \ - --wheel-dir dist_repaired - fi -done - -# Rename wheel to include CUDA version suffix -mkdir -p /workspace/wheelhouse -for wheel in dist_repaired/pynvbench-*.whl; do - if [[ -f "$wheel" ]]; then - base_name=$(basename "$wheel" .whl) - # Append CUDA version to the local version identifier - # e.g., pynvbench-0.1.0.dev1+gabc123-cp312-cp312-manylinux_2_28_x86_64.whl - # becomes pynvbench-0.1.0.dev1+gabc123.cu12-cp312-cp312-manylinux_2_28_x86_64.whl - if [[ "$base_name" =~ ^(.*)-cp([0-9]+)-cp([0-9]+)-(.*) ]]; then - pkg_version="${BASH_REMATCH[1]}" - py_tag="cp${BASH_REMATCH[2]}" - abi_tag="cp${BASH_REMATCH[3]}" - platform="${BASH_REMATCH[4]}" - # If version has a local part (contains +), append .cu${cuda_major} to it - # Otherwise add +cu${cuda_major} - if [[ "$pkg_version" =~ \+ ]]; then - new_version="${pkg_version}.cu${cuda_major}" - else - new_version="${pkg_version}+cu${cuda_major}" - fi - new_name="${new_version}-${py_tag}-${abi_tag}-${platform}.whl" - mv "$wheel" "/workspace/wheelhouse/${new_name}" - echo "Renamed wheel to: ${new_name}" - else - # Fallback if regex doesn't match - mv "$wheel" /workspace/wheelhouse/ - echo "Moved wheel: $(basename $wheel)" - fi - fi -done diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index e7c0c776..f6ced68a 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -25,12 +25,13 @@ CPMAddPackage( CPMAddPackage("gh:pybind/pybind11@3.0.0") -# Determine CUDA major version for extension naming and directory structure +# Determine CUDA major version for directory structure string(REGEX MATCH "^([0-9]+)" CUDA_VERSION_MAJOR "${CUDAToolkit_VERSION}") set(CUDA_VERSION_DIR "cu${CUDA_VERSION_MAJOR}") +message(STATUS "Building extension for CUDA ${CUDA_VERSION_MAJOR}, output directory: cuda/bench/${CUDA_VERSION_DIR}") -set(NVBENCH_EXTENSION_NAME "_nvbench_cu${CUDA_VERSION_MAJOR}") -message(STATUS "Building extension: ${NVBENCH_EXTENSION_NAME} for CUDA ${CUDA_VERSION_MAJOR}, output directory: ${CUDA_VERSION_DIR}") +# Extension name is always _nvbench (directory provides CUDA version separation) +set(NVBENCH_EXTENSION_NAME "_nvbench") 
pybind11_add_module(${NVBENCH_EXTENSION_NAME} MODULE src/py_nvbench.cpp) target_compile_definitions(${NVBENCH_EXTENSION_NAME} PRIVATE PYBIND11_MODULE_NAME=${NVBENCH_EXTENSION_NAME}) diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py index 0e8ce77d..a47bb394 100644 --- a/python/cuda/bench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -53,7 +53,7 @@ def _get_cuda_major_version(): _cuda_major = _get_cuda_major_version() _extra_name = f"cu{_cuda_major}" -_module_fullname = f"cuda.bench.{_extra_name}._nvbench_cu{_cuda_major}" +_module_fullname = f"cuda.bench.{_extra_name}._nvbench" try: _nvbench_module = importlib.import_module(_module_fullname) From 35f5272eeee9a03ca6496004ab5f484c65645875 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 16:49:09 -0500 Subject: [PATCH 12/20] Some fixes --- python/CMakeLists.txt | 19 ++++++++----------- python/pyproject.toml | 2 +- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index f6ced68a..1390caea 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -30,19 +30,16 @@ string(REGEX MATCH "^([0-9]+)" CUDA_VERSION_MAJOR "${CUDAToolkit_VERSION}") set(CUDA_VERSION_DIR "cu${CUDA_VERSION_MAJOR}") message(STATUS "Building extension for CUDA ${CUDA_VERSION_MAJOR}, output directory: cuda/bench/${CUDA_VERSION_DIR}") -# Extension name is always _nvbench (directory provides CUDA version separation) -set(NVBENCH_EXTENSION_NAME "_nvbench") +pybind11_add_module(_nvbench MODULE src/py_nvbench.cpp) +target_compile_definitions(_nvbench PRIVATE PYBIND11_MODULE_NAME=_nvbench) +target_link_libraries(_nvbench PUBLIC nvbench::nvbench) +target_link_libraries(_nvbench PRIVATE CUDA::cudart_static CUDA::cuda_driver) -pybind11_add_module(${NVBENCH_EXTENSION_NAME} MODULE src/py_nvbench.cpp) -target_compile_definitions(${NVBENCH_EXTENSION_NAME} PRIVATE PYBIND11_MODULE_NAME=${NVBENCH_EXTENSION_NAME}) -target_link_libraries(${NVBENCH_EXTENSION_NAME} PUBLIC nvbench::nvbench) -target_link_libraries(${NVBENCH_EXTENSION_NAME} PRIVATE CUDA::cudart_static CUDA::cuda_driver) +set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN") +set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) +set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON) -set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INSTALL_RPATH "$ORIGIN") -set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) -set_target_properties(${NVBENCH_EXTENSION_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) - -install(TARGETS ${NVBENCH_EXTENSION_NAME} DESTINATION cuda/bench/${CUDA_VERSION_DIR}) +install(TARGETS _nvbench DESTINATION cuda/bench/${CUDA_VERSION_DIR}) # Determine target that nvbench::nvbench is an alias of, # necessary because ALIAS targets cannot be installed diff --git a/python/pyproject.toml b/python/pyproject.toml index 08161210..e752eca2 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -55,7 +55,7 @@ build-dir = "build/{wheel_tag}" [tool.scikit-build.cmake] version = ">=3.30.4" -args = ["-DNVBench_ENABLE_NVML=OFF", "-DNVBench_ENABLE_CUPTI=OFF"] +args = [] build-type = "Release" source-dir = "." 
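
For context on the layout that patches 11 and 12 converge on: each wheel ships the extension twice, as cuda/bench/cu12/_nvbench and cuda/bench/cu13/_nvbench, and cuda/bench/__init__.py imports exactly one of them based on the CUDA major version detected at runtime. The Python sketch below is a minimal, self-contained illustration of that dispatch pattern only; the load_nvbench_extension name and the explicit cuda_major argument are placeholders, since the package's real _get_cuda_major_version helper is not reproduced in these diffs.

    import importlib

    def load_nvbench_extension(cuda_major: int):
        """Import the _nvbench extension built for the given CUDA major version.

        The installed package derives cuda_major from the system's CUDA
        driver/runtime via its own helper; here it is passed in explicitly
        to keep the sketch self-contained.
        """
        module_fullname = f"cuda.bench.cu{cuda_major}._nvbench"
        try:
            return importlib.import_module(module_fullname)
        except ImportError as exc:
            raise ImportError(
                f"No pynvbench extension found for CUDA {cuda_major}.x; "
                f"expected module {module_fullname!r} inside the wheel."
            ) from exc

    # Example usage on a machine with a CUDA 13 driver, after installing the
    # built wheel with its CUDA 13 extra, mirroring the test script's
    # pip install "<path-to-wheel>[cu13]" pattern:
    # _nvbench = load_nvbench_extension(13)
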
From 933b951aea71b86654e1d190a3d79d2f4e16d496 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 16:50:59 -0500 Subject: [PATCH 13/20] We can just use TEST_EXTRA --- ci/test_pynvbench_inner.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/test_pynvbench_inner.sh b/ci/test_pynvbench_inner.sh index 4a7bd1c1..4303a5dd 100755 --- a/ci/test_pynvbench_inner.sh +++ b/ci/test_pynvbench_inner.sh @@ -42,8 +42,8 @@ fi CUDA_EXTRA="${cuda_extra:-cu${cuda_version}}" TEST_EXTRA="test-cu${cuda_version}" -echo "Installing wheel: $PYNVBENCH_WHEEL_PATH with extras: ${CUDA_EXTRA},${TEST_EXTRA}" -python -m pip install "${PYNVBENCH_WHEEL_PATH}[${CUDA_EXTRA},${TEST_EXTRA}]" +echo "Installing wheel: $PYNVBENCH_WHEEL_PATH with extras: ${TEST_EXTRA}" +python -m pip install "${PYNVBENCH_WHEEL_PATH}[${TEST_EXTRA}]" # Run tests cd "/workspace/python/test/" From 13cb606165d7beb364f243c0d41c72f79d534c8e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 16:52:41 -0500 Subject: [PATCH 14/20] Reduce CI matrix --- .../build-and-test-python-wheels.yml | 6 +- .github/workflows/pr.yml | 102 +++++++++--------- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/.github/workflows/build-and-test-python-wheels.yml b/.github/workflows/build-and-test-python-wheels.yml index 46f391bc..8ca1a179 100644 --- a/.github/workflows/build-and-test-python-wheels.yml +++ b/.github/workflows/build-and-test-python-wheels.yml @@ -18,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - python: ['3.10', '3.11', '3.12', '3.13'] + python: ['3.13'] steps: - name: Checkout repository @@ -54,8 +54,8 @@ jobs: strategy: fail-fast: false matrix: - cuda: ['12', '13'] - python: ['3.10', '3.11', '3.12', '3.13'] + cuda: ['13'] + python: ['3.13'] steps: - name: Checkout repository diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index c42cda01..3653f510 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -35,46 +35,46 @@ permissions: pull-requests: read jobs: - compute-matrix: - name: Compute matrix - runs-on: ubuntu-latest - outputs: - DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} - PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} - PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} - base_sha: ${{ steps.export-pr-info.outputs.base_sha }} - pr_number: ${{ steps.export-pr-info.outputs.pr_number }} - steps: - - name: Checkout repo - uses: actions/checkout@v4 - - name: Lookup PR info - id: get-pr-info - uses: nv-gha-runners/get-pr-info@main - - name: Export PR info - id: export-pr-info - run: | - echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" - echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" - - name: Compute matrix outputs - id: set-outputs - run: | - .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request + # compute-matrix: + # name: Compute matrix + # runs-on: ubuntu-latest + # outputs: + # DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} + # PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} + # PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} + # base_sha: ${{ steps.export-pr-info.outputs.base_sha }} + # pr_number: ${{ steps.export-pr-info.outputs.pr_number }} + # steps: + # - name: Checkout repo + # uses: actions/checkout@v4 + # - name: Lookup PR 
info + # id: get-pr-info + # uses: nv-gha-runners/get-pr-info@main + # - name: Export PR info + # id: export-pr-info + # run: | + # echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" + # echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" + # - name: Compute matrix outputs + # id: set-outputs + # run: | + # .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request - nvbench: - name: NVBench CUDA${{ matrix.cuda_host_combination }} - permissions: - id-token: write - contents: read - needs: compute-matrix - uses: ./.github/workflows/dispatch-build-and-test.yml - strategy: - fail-fast: false - matrix: - cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} - with: - project_name: "nvbench" - per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} - devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} + # nvbench: + # name: NVBench CUDA${{ matrix.cuda_host_combination }} + # permissions: + # id-token: write + # contents: read + # needs: compute-matrix + # uses: ./.github/workflows/dispatch-build-and-test.yml + # strategy: + # fail-fast: false + # matrix: + # cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} + # with: + # project_name: "nvbench" + # per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} + # devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} python-wheels: name: Python Wheels @@ -83,16 +83,16 @@ jobs: contents: read uses: ./.github/workflows/build-and-test-python-wheels.yml - verify-devcontainers: - name: Verify Dev Containers - if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} - needs: compute-matrix - permissions: - id-token: write - contents: read - uses: ./.github/workflows/verify-devcontainers.yml - with: - base_sha: ${{ needs.compute-matrix.outputs.base_sha }} + # verify-devcontainers: + # name: Verify Dev Containers + # if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} + # needs: compute-matrix + # permissions: + # id-token: write + # contents: read + # uses: ./.github/workflows/verify-devcontainers.yml + # with: + # base_sha: ${{ needs.compute-matrix.outputs.base_sha }} # This job is the final job that runs after all other jobs and is used for branch protection status checks. 
# See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks @@ -102,9 +102,9 @@ jobs: name: CI if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success needs: - - nvbench + # - nvbench - python-wheels - - verify-devcontainers + # - verify-devcontainers steps: - name: Check status of all precursor jobs if: >- From 9af979d3f305ceb2f616e29f401ea1dd982d412a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 17:06:20 -0500 Subject: [PATCH 15/20] Add exclusions --- ci/build_multi_cuda_wheel.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/build_multi_cuda_wheel.sh b/ci/build_multi_cuda_wheel.sh index a25cf39f..29f7ac41 100755 --- a/ci/build_multi_cuda_wheel.sh +++ b/ci/build_multi_cuda_wheel.sh @@ -163,6 +163,10 @@ for wheel in wheelhouse_merged/pynvbench-*.whl; do $PYTHON -m auditwheel repair \ --exclude 'libcuda.so.1' \ --exclude 'libnvidia-ml.so.1' \ + --exclude 'libcupti.so.12' \ + --exclude 'libcupti.so.13' \ + --exclude 'libnvperf_host.so' \ + --exclude 'libnvperf_target.so' \ "$wheel" \ --wheel-dir wheelhouse_final done From ec3615b1a5f17f1220b1f03311b4aaf92e779c5e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Dec 2025 18:12:28 -0500 Subject: [PATCH 16/20] Revert "Reduce CI matrix" This reverts commit 13cb606165d7beb364f243c0d41c72f79d534c8e. --- .../build-and-test-python-wheels.yml | 6 +- .github/workflows/pr.yml | 102 +++++++++--------- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/.github/workflows/build-and-test-python-wheels.yml b/.github/workflows/build-and-test-python-wheels.yml index 8ca1a179..46f391bc 100644 --- a/.github/workflows/build-and-test-python-wheels.yml +++ b/.github/workflows/build-and-test-python-wheels.yml @@ -18,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - python: ['3.13'] + python: ['3.10', '3.11', '3.12', '3.13'] steps: - name: Checkout repository @@ -54,8 +54,8 @@ jobs: strategy: fail-fast: false matrix: - cuda: ['13'] - python: ['3.13'] + cuda: ['12', '13'] + python: ['3.10', '3.11', '3.12', '3.13'] steps: - name: Checkout repository diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 3653f510..c42cda01 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -35,46 +35,46 @@ permissions: pull-requests: read jobs: - # compute-matrix: - # name: Compute matrix - # runs-on: ubuntu-latest - # outputs: - # DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} - # PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} - # PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} - # base_sha: ${{ steps.export-pr-info.outputs.base_sha }} - # pr_number: ${{ steps.export-pr-info.outputs.pr_number }} - # steps: - # - name: Checkout repo - # uses: actions/checkout@v4 - # - name: Lookup PR info - # id: get-pr-info - # uses: nv-gha-runners/get-pr-info@main - # - name: Export PR info - # id: export-pr-info - # run: | - # echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" - # echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" - # - name: Compute matrix outputs - # id: set-outputs - # run: | - # .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request + compute-matrix: + name: Compute matrix + runs-on: ubuntu-latest + outputs: + 
DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}} + PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}} + PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}} + base_sha: ${{ steps.export-pr-info.outputs.base_sha }} + pr_number: ${{ steps.export-pr-info.outputs.pr_number }} + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Lookup PR info + id: get-pr-info + uses: nv-gha-runners/get-pr-info@main + - name: Export PR info + id: export-pr-info + run: | + echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" + echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" + - name: Compute matrix outputs + id: set-outputs + run: | + .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request - # nvbench: - # name: NVBench CUDA${{ matrix.cuda_host_combination }} - # permissions: - # id-token: write - # contents: read - # needs: compute-matrix - # uses: ./.github/workflows/dispatch-build-and-test.yml - # strategy: - # fail-fast: false - # matrix: - # cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} - # with: - # project_name: "nvbench" - # per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} - # devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} + nvbench: + name: NVBench CUDA${{ matrix.cuda_host_combination }} + permissions: + id-token: write + contents: read + needs: compute-matrix + uses: ./.github/workflows/dispatch-build-and-test.yml + strategy: + fail-fast: false + matrix: + cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }} + with: + project_name: "nvbench" + per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }} + devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }} python-wheels: name: Python Wheels @@ -83,16 +83,16 @@ jobs: contents: read uses: ./.github/workflows/build-and-test-python-wheels.yml - # verify-devcontainers: - # name: Verify Dev Containers - # if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} - # needs: compute-matrix - # permissions: - # id-token: write - # contents: read - # uses: ./.github/workflows/verify-devcontainers.yml - # with: - # base_sha: ${{ needs.compute-matrix.outputs.base_sha }} + verify-devcontainers: + name: Verify Dev Containers + if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }} + needs: compute-matrix + permissions: + id-token: write + contents: read + uses: ./.github/workflows/verify-devcontainers.yml + with: + base_sha: ${{ needs.compute-matrix.outputs.base_sha }} # This job is the final job that runs after all other jobs and is used for branch protection status checks. 
# See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks @@ -102,9 +102,9 @@ jobs: name: CI if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success needs: - # - nvbench + - nvbench - python-wheels - # - verify-devcontainers + - verify-devcontainers steps: - name: Check status of all precursor jobs if: >- From d85f6ba6d59be7d99aa88d5dc42411ab8d68fe71 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Tue, 27 Jan 2026 22:25:45 +0000 Subject: [PATCH 17/20] Use appropriate cmake variable to get CUDA major version --- python/CMakeLists.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 36c474e5..7f8548c4 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -26,9 +26,8 @@ CPMAddPackage( CPMAddPackage("gh:pybind/pybind11@3.0.1") # Determine CUDA major version for directory structure -string(REGEX MATCH "^([0-9]+)" CUDA_VERSION_MAJOR "${CUDAToolkit_VERSION}") -set(CUDA_VERSION_DIR "cu${CUDA_VERSION_MAJOR}") -message(STATUS "Building extension for CUDA ${CUDA_VERSION_MAJOR}, output directory: cuda/bench/${CUDA_VERSION_DIR}") +set(CUDA_VERSION_DIR "cu${CUDAToolkit_VERSION_MAJOR}") +message(STATUS "Building extension for CUDA ${CUDAToolkit_VERSION_MAJOR}, output directory: cuda/bench/${CUDA_VERSION_DIR}") add_library(_nvbench MODULE src/py_nvbench.cpp) target_include_directories(_nvbench PRIVATE ${Python_INCLUDE_DIRS}) From b06d87c88fcfda86af9c18ee06d73e933822425b Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Tue, 27 Jan 2026 22:27:29 +0000 Subject: [PATCH 18/20] Reuse cuda image name to reduce redundancy --- ci/build_multi_cuda_wheel.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ci/build_multi_cuda_wheel.sh b/ci/build_multi_cuda_wheel.sh index 29f7ac41..058dd311 100755 --- a/ci/build_multi_cuda_wheel.sh +++ b/ci/build_multi_cuda_wheel.sh @@ -54,13 +54,14 @@ readonly devcontainer_version=25.12 readonly devcontainer_distro=rockylinux8 if [[ "$(uname -m)" == "aarch64" ]]; then - readonly cuda12_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda12_version}-${devcontainer_distro}-py${py_version}-arm64 - readonly cuda13_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda13_version}-${devcontainer_distro}-py${py_version}-arm64 + readonly host_arch_suffix="-arm64" else - readonly cuda12_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda12_version}-${devcontainer_distro}-py${py_version} - readonly cuda13_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda13_version}-${devcontainer_distro}-py${py_version} + readonly host_arch_suffix="" fi +readonly cuda12_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda12_version}-${devcontainer_distro}-py${py_version}${host_arch_suffix} +readonly cuda13_image=rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda13_version}-${devcontainer_distro}-py${py_version}${host_arch_suffix} + mkdir -p wheelhouse for ctk in 12 13; do From 48195f7be3e6f3a76c8b987f7969fdd918820f3c Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Tue, 27 Jan 2026 22:29:11 +0000 Subject: [PATCH 19/20] Remove unused cuda-bindings dep --- python/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index e752eca2..e4b2ca4c 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -30,10 +30,10 @@ readme = { file = 
"README.md", content-type = "text/markdown" } [project.optional-dependencies] # CUDA 12.x dependencies -cu12 = ["cuda-bindings>=12.0.0,<13.0.0", "nvidia-cuda-cupti-cu12"] +cu12 = ["nvidia-cuda-cupti-cu12"] # CUDA 13.x dependencies -cu13 = ["cuda-bindings>=13.0.0,<14.0.0", "nvidia-cuda-cupti>=13.0"] +cu13 = ["nvidia-cuda-cupti>=13.0"] # Test dependencies for CUDA 12 test-cu12 = ["pynvbench[cu12]", "pytest", "cupy-cuda12x", "numba"] From 50997476b7d76ebc9e8b39a562511a3888aaa177 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Tue, 27 Jan 2026 23:18:51 +0000 Subject: [PATCH 20/20] Add cuda-bindings back to deps --- python/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index e4b2ca4c..e752eca2 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -30,10 +30,10 @@ readme = { file = "README.md", content-type = "text/markdown" } [project.optional-dependencies] # CUDA 12.x dependencies -cu12 = ["nvidia-cuda-cupti-cu12"] +cu12 = ["cuda-bindings>=12.0.0,<13.0.0", "nvidia-cuda-cupti-cu12"] # CUDA 13.x dependencies -cu13 = ["nvidia-cuda-cupti>=13.0"] +cu13 = ["cuda-bindings>=13.0.0,<14.0.0", "nvidia-cuda-cupti>=13.0"] # Test dependencies for CUDA 12 test-cu12 = ["pynvbench[cu12]", "pytest", "cupy-cuda12x", "numba"]