4 changes: 4 additions & 0 deletions .gitignore
@@ -4,11 +4,15 @@ site/
docs/
.vscode/
*.egg-info/
CMakeFiles/
_C.*.so
_C.so
__pycache__
*.ncu-rep
*.nsys-rep
*.sqlite
*.log
test
version.py
compile_commands.json
_codeql_detected_source_root
136 changes: 76 additions & 60 deletions CMakeLists.txt
@@ -1,26 +1,67 @@
cmake_minimum_required(VERSION 3.18)
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)

# Detect CUDA toolkit: tries host installation first, then falls back to
# pip-installed packages (env WITH_PIP_CUDA_TOOLCHAIN or auto-detect).
# Must be included before project() so CMAKE_CUDA_COMPILER is set.
include(${CMAKE_CURRENT_LIST_DIR}/cmake/FindPipCUDAToolkit.cmake)

project(hpc_ops LANGUAGES CXX CUDA)

enable_language(CUDA)
set(CMAKE_BUILD_TYPE "Release")
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_CUDA_RUNTIME_LIBRARY None)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

find_package(CUDAToolkit REQUIRED)
find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED)
set(CMAKE_CUDA_ARCHITECTURES "90a")

file(GLOB_RECURSE SOURCES "src/*/*.cu" "src/*/*.cc")
list(FILTER SOURCES EXCLUDE REGEX ".*test.*")
find_package(CUDAToolkit REQUIRED)
include_directories(${CUDAToolkit_INCLUDE_DIRS})
link_directories(${CUDAToolkit_LIBRARY_DIR} ${CUDAToolkit_LIBRARY_DIR}/stubs)

find_program(CCACHE_FOUND ccache)
if(CCACHE_FOUND)
set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_FOUND}")
set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_FOUND}")
set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_FOUND}")
endif()

find_package(
Python
COMPONENTS Interpreter
REQUIRED
)
find_package(tvm_ffi CONFIG REQUIRED)

if(NOT "${SKBUILD_SABI_VERSION}" STREQUAL "")
set(USE_SABI USE_SABI ${SKBUILD_SABI_VERSION})
endif()

add_library(_C MODULE ${SOURCES})
include_directories(
./
src/utils/include
3rd/cutlass/include
)

# Collect all CUDA source files (kernels)
file(GLOB_RECURSE CUDA_SOURCES "src/*/*.cu")
list(FILTER CUDA_SOURCES EXCLUDE REGEX ".*test.*")
# Exclude built_json.cu as it's compiled separately
list(FILTER CUDA_SOURCES EXCLUDE REGEX ".*/C/built_json\\.cu$")

# Collect all CC source files (entry points)
file(GLOB_RECURSE CC_SOURCES "src/*/*.cc")
list(FILTER CC_SOURCES EXCLUDE REGEX ".*test.*")
# Exclude C.cc placeholder
list(FILTER CC_SOURCES EXCLUDE REGEX ".*/C/C\\.cc$")

# Build all kernels as a single shared library
add_library(_C SHARED ${CUDA_SOURCES} ${CC_SOURCES} src/C/built_json.cu)
target_link_libraries(_C PRIVATE tvm_ffi::shared cuda cudart)
set_target_properties(
_C PROPERTIES
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/

OUTPUT_NAME "_C"
PREFIX ""
SUFFIX ".abi3.so"
CUDA_RUNTIME_LIBRARY "Shared"
SUFFIX ".so"
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/

CUDA_SEPARABLE_COMPILATION OFF
CUDA_RESOLVE_DEVICE_SYMBOLS ON
@@ -29,7 +70,6 @@ set_target_properties(
C_VISIBILITY_PRESET "hidden"
CXX_VISIBILITY_PRESET "hidden"
VISIBILITY_INLINES_HIDDEN ON
CUDA_VISIBILITY_PRESET "hidden"

CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
@@ -42,59 +82,35 @@ set_target_properties(
CUDA_ARCHITECTURES "90a"
)

if(NOT DEFINED HPC_GIT_HASH_STR OR HPC_GIT_HASH_STR STREQUAL "")
execute_process(
COMMAND git rev-parse --short=7 HEAD
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE HPC_GIT_HASH_STR
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
if(NOT HPC_GIT_HASH_STR)
set(HPC_GIT_HASH_STR "unknown")
endif()
endif()

if(NOT DEFINED HPC_VERSION_STR OR HPC_VERSION_STR STREQUAL "")
set(HPC_VERSION_STR "0.0.1-dev")
endif()

target_compile_definitions(
_C PRIVATE
Py_LIMITED_API=0x03090000
_GLIBCXX_USE_CXX11_ABI=1
HPC_GIT_HASH_STR=${HPC_GIT_HASH_STR}
HPC_VERSION_STR=${HPC_VERSION_STR}
)

execute_process(
COMMAND python3 -c "
from torch.utils.cpp_extension import include_paths
print(';'.join(include_paths()), end='')
"
OUTPUT_VARIABLE TORCH_INCLUDE_PATHS
)

execute_process(
COMMAND python3 -c "
from torch.utils.cpp_extension import library_paths
print(';'.join(library_paths()), end='')
"
OUTPUT_VARIABLE TORCH_LIBRARY_PATHS
)


target_include_directories(
_C PRIVATE
./
3rd/cutlass/include
${CUDAToolkit_INCLUDE_DIRS}
${TORCH_INCLUDE_PATHS}
)

target_link_directories(
_C PRIVATE
${TORCH_LIBRARY_PATHS}
)

target_link_libraries(
_C PRIVATE
cuda
c10
torch
torch_cpu
cudart
c10_cuda
torch_cuda
HPC_GIT_HASH_STR="${HPC_GIT_HASH_STR}"
HPC_VERSION_STR="${HPC_VERSION_STR}"
)

target_compile_options(
_C PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:
-Werror=all-warnings
-Wno-error=deprecated-declarations
-lineinfo
--expt-relaxed-constexpr
-std=c++17
@@ -108,7 +124,7 @@ target_compile_options(
-g
-fwrapv
-Wall
-DTORCH_API_INCLUDE_EXTENSION_H
-DTORCH_EXTENSION_NAME=_C
>
)

install(TARGETS _C LIBRARY DESTINATION hpc/ops)
2 changes: 1 addition & 1 deletion Makefile
@@ -9,7 +9,7 @@ CSRC_FILES=$(CC_FILES) $(CU_FILES) $(CUH_FILES) $(H_FILES)


all:
python3 setup.py build
pip install --no-build-isolation -e .

wheel:
find . -type d -name "__pycache__" -exec rm -rf {} +
28 changes: 26 additions & 2 deletions README.md
@@ -43,13 +43,37 @@ HPC-Ops is a **production-grade, high-performance, and easy-to-use** operator li
*You can set up the environment by installing the modules listed in requirements-dev.txt.*

### Install from Source

#### With host CUDA toolchain
```bash
git clone https://github.com/Tencent/hpc-ops.git
cd hpc-ops

# build packages
# Ensure the CUDA toolkit is installed on the host (e.g., /usr/local/cuda)
pip install . -v
```
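
A quick way to confirm the host toolkit is visible before building (a generic check, not specific to this project):

```bash
# Either nvcc is on PATH or the default install prefix exists
nvcc --version || ls /usr/local/cuda/bin/nvcc
```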

#### With pip-provided CUDA toolchain (no host CUDA required)

Option A — pip toolchain in the current environment (use `--no-build-isolation`):

```bash
pip install nvidia-cuda-nvcc nvidia-cuda-cccl scikit-build-core cmake ninja
pip install . -v --no-build-isolation
```

Option B — pip toolchain in another virtualenv or path:

```bash
# Point to the cu<ver> directory inside another venv's site-packages
export WITH_PIP_CUDA_TOOLCHAIN=/path/to/venv/lib/python3.x/site-packages/nvidia/cu13
pip install . -v
```
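
Before building, you can confirm that `nvcc` sits where the build expects it; `FindPipCUDAToolkit.cmake` aborts with an error if it is missing from that location:

```bash
# The configure step fails fast unless nvcc exists at $WITH_PIP_CUDA_TOOLCHAIN/bin/nvcc
ls "$WITH_PIP_CUDA_TOOLCHAIN/bin/nvcc"
```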

#### Build wheel
```bash
make wheel
python3 -m pip install dist/*.whl
python3 -m pip install dist/*.whl
```
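
As an optional sanity check after either install path, you can try importing the extension. The `hpc.ops` package name is an assumption based on the CMake install destination (`hpc/ops`) in this change, not something stated in the README:

```bash
# Assumes the built extension is installed under hpc/ops
python3 -c "import hpc.ops; print('ok')"
```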

### Basic Usage
70 changes: 70 additions & 0 deletions cmake/FindPipCUDAToolkit.cmake
@@ -0,0 +1,70 @@
# FindPipCUDAToolkit.cmake
#
# Locate CUDA toolkit — first trying the host system, then falling back
# to pip-installed packages (nvidia-cuda-nvcc, nvidia-cuda-cccl).
#
# This module should be included BEFORE project() to set CMAKE_CUDA_COMPILER
# when pip CUDA is used.
#
# Detection order:
# 1. Try find_package(CUDAToolkit QUIET) — succeeds if a host CUDA
# installation is available; skip pip detection.
# 2. If env var WITH_PIP_CUDA_TOOLCHAIN is set to a path (e.g., .../cu13),
# use that directory directly as the CUDA toolkit root.
# 3. Otherwise, try auto-detecting from the current Python environment's
# site-packages (works with --no-build-isolation).

# --- Try host CUDA first ---
find_package(CUDAToolkit QUIET)
if(CUDAToolkit_FOUND)
return()
endif()

find_program(_PIP_CUDA_PYTHON_EXE NAMES python3 python)
if(NOT _PIP_CUDA_PYTHON_EXE)
return()
endif()

# --- Strategy 1: explicit path via env var ---
if(DEFINED ENV{WITH_PIP_CUDA_TOOLCHAIN})
set(_PIP_CUDA_ROOT "$ENV{WITH_PIP_CUDA_TOOLCHAIN}")
if(NOT EXISTS "${_PIP_CUDA_ROOT}/bin/nvcc")
message(FATAL_ERROR
"FindPipCUDAToolkit: WITH_PIP_CUDA_TOOLCHAIN is set to '${_PIP_CUDA_ROOT}' "
"but nvcc was not found at '${_PIP_CUDA_ROOT}/bin/nvcc'")
endif()
# Prepare the directory (create lib64 symlink, unversioned .so symlinks,
# libcuda.so stub) that CMake / nvcc expect but pip packages omit.
execute_process(
COMMAND "${_PIP_CUDA_PYTHON_EXE}" "${CMAKE_CURRENT_LIST_DIR}/find_pip_cuda.py"
"${_PIP_CUDA_ROOT}"
OUTPUT_QUIET
)
message(STATUS "FindPipCUDAToolkit: using env WITH_PIP_CUDA_TOOLCHAIN=${_PIP_CUDA_ROOT}")
else()
# --- Strategy 2: auto-detect from current Python env ---
execute_process(
COMMAND "${_PIP_CUDA_PYTHON_EXE}" "${CMAKE_CURRENT_LIST_DIR}/find_pip_cuda.py"
OUTPUT_VARIABLE _PIP_CUDA_OUTPUT
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE _PIP_CUDA_RESULT
)

if(NOT _PIP_CUDA_RESULT EQUAL 0)
message(STATUS "FindPipCUDAToolkit: pip-installed CUDA toolkit not found")
return()
endif()

string(JSON _PIP_CUDA_ROOT GET "${_PIP_CUDA_OUTPUT}" "root")
message(STATUS "FindPipCUDAToolkit: auto-detected from Python environment")
endif()

# --- Common pip-CUDA setup ---
set(CMAKE_CUDA_COMPILER "${_PIP_CUDA_ROOT}/bin/nvcc" CACHE FILEPATH "CUDA compiler (from pip)" FORCE)
set(CUDAToolkit_ROOT "${_PIP_CUDA_ROOT}" CACHE PATH "CUDA toolkit root (from pip)" FORCE)

list(APPEND CMAKE_LIBRARY_PATH "${_PIP_CUDA_ROOT}/lib/stubs" "${_PIP_CUDA_ROOT}/lib")

message(STATUS "FindPipCUDAToolkit: using pip-installed CUDA toolkit")
message(STATUS " nvcc: ${CMAKE_CUDA_COMPILER}")
message(STATUS " root: ${CUDAToolkit_ROOT}")
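
The helper script `cmake/find_pip_cuda.py` is invoked above but not included in this diff. Judging only from how the CMake module calls it, its contract appears to be: optionally accept an explicit toolkit root, otherwise search the current Python environment, print JSON containing a `root` key, and prepare the directory layout (lib64 / unversioned `.so` / `libcuda.so` stub symlinks). The sketch below is a hypothetical illustration of that contract, not the actual script:

```python
# Hypothetical sketch of the find_pip_cuda.py contract (not the real implementation).
# Prints {"root": "<toolkit root>"} on success and exits non-zero on failure.
import glob
import json
import os
import site
import sys


def find_root():
    # Strategy 1: an explicit root passed on the command line
    if len(sys.argv) > 1:
        root = sys.argv[1]
        return root if os.path.exists(os.path.join(root, "bin", "nvcc")) else None
    # Strategy 2: look for nvidia/cu* under the current environment's site-packages
    for sp in site.getsitepackages():
        for root in sorted(glob.glob(os.path.join(sp, "nvidia", "cu*"))):
            if os.path.exists(os.path.join(root, "bin", "nvcc")):
                return root
    return None


root = find_root()
if root is None:
    sys.exit(1)
print(json.dumps({"root": root}))
```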