From a9d8a4f3a89f6d2f9e5ad540b1aa42a4046a6db6 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Thu, 23 Jan 2025 14:57:02 -0500 Subject: [PATCH] :sparkles: bs for matx --- code/matx/CMakeLists.txt | 16 ++++- code/matx/bs.cu | 138 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+), 3 deletions(-) create mode 100644 code/matx/bs.cu diff --git a/code/matx/CMakeLists.txt b/code/matx/CMakeLists.txt index 278b845..fc9d400 100644 --- a/code/matx/CMakeLists.txt +++ b/code/matx/CMakeLists.txt @@ -11,7 +11,7 @@ FetchContent_Declare(matx GIT_TAG main ) -FetchContent_GetProperties(arrayfire) +FetchContent_GetProperties(matx) if(NOT matx_POPULATED) FetchContent_Populate(matx) add_subdirectory(${matx_SOURCE_DIR} ${matx_BINARY_DIR} EXCLUDE_FROM_ALL) @@ -27,8 +27,18 @@ if (NOT CMAKE_CXX_COMPILER) set(CMAKE_CXX_COMPILER "/usr/bin/g++") endif() -set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) +set(CMAKE_CUDA_COMPILER /opt/nvidia/hpc_sdk/Linux_x86_64/24.11/compilers/bin/nvcc) -add_executable(test test.cu) +set(CMAKE_CUDA_ARCHITECTURES 75) + +# Optimization flags +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -ftree-vectorize -march=native") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -use_fast_math -arch=compute_75 -code=sm_75") + +# Enable Link Time Optimization (LTO) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -flto") + +add_executable(test bs.cu) set_property(TARGET test PROPERTY CXX_STANDARD 20) target_link_libraries(test PRIVATE matx::matx) diff --git a/code/matx/bs.cu b/code/matx/bs.cu new file mode 100644 index 0000000..e81bf90 --- /dev/null +++ b/code/matx/bs.cu @@ -0,0 +1,138 @@ +#include "matx.h" +#include +#include +#include +#include + +using namespace matx; + +/** + * MatX uses C++ expression templates to build arithmetic expressions that compile into a lazily-evaluated + * type for executing on the device. 
Currently, nvcc cannot see certain optimizations + * when building the expression tree that would be obvious by looking at the code. Specifically any code reusing + * the same tensor multiple times appears to the compiler as separate tensors, and it may issue multiple load + * instructions. While caching helps, this can have a slight performance impact when compared to native CUDA + * kernels. To work around this problem, complex expressions can be placed in a custom operator by adding some + * boilerplate code around the original expression. This custom operator can then be used either alone or inside + * other arithmetic expressions, and only a single load is issued for each tensor. + * + * This example uses the Black-Scholes equation to demonstrate the two ways to implement the equation in MatX, and + * shows the performance difference. + */ + +/* Custom operator */ +template +class BlackScholes : public BaseOp> +{ +private: + O out_; + I1 K_; + I2 V_; + I3 S_; + I4 r_; + I5 T_; + +public: + BlackScholes(O out, I1 K, I2 V, I3 S, I4 r, I5 T) + : out_(out), V_(V), S_(S), K_(K), r_(r), T_(T) {} + + __device__ inline void operator()(index_t idx) + { + auto V = V_(); + auto K = K_(); + auto S = S_(idx); + auto T = T_(); + auto r = r_(); + + auto VsqrtT = V * sqrt(T); + auto d1 = (log(S / K) + (r + 0.5f * V * V) * T) / VsqrtT; + auto d2 = d1 - VsqrtT; + auto cdf_d1 = normcdff(d1); + auto cdf_d2 = normcdff(d2); + auto expRT = exp(-1.f * r * T); + + out_(idx) = S * cdf_d1 - K * expRT * cdf_d2; + } + + __host__ __device__ inline index_t Size(uint32_t i) const { return out_.Size(i); } + static inline constexpr __host__ __device__ int32_t Rank() { return O::Rank(); } +}; + +template +void compute_black_scholes_matx(tensor_t &K, + tensor_t &S, + tensor_t &V, + tensor_t &r, + tensor_t &T, + tensor_t &output, + cudaExecutor &exec) +{ + auto VsqrtT = V * sqrt(T); + auto d1 = (log(S / K) + (r + 0.5f * V * V) * T) / VsqrtT; + auto d2 = d1 - VsqrtT; + auto cdf_d1 = normcdf(d1); 
+ auto cdf_d2 = normcdf(d2); + auto expRT = exp(-1.f * r * T); + + (output = S * cdf_d1 - K * expRT * cdf_d2).run(exec); +} + +int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) +{ + MATX_ENTER_HANDLER(); + + using dtype = float; + + index_t input_size = 100000; + auto output_tensor = make_tensor({input_size}); + auto S_tensor = make_tensor({input_size}); + auto K_tensor = make_tensor({}); + auto V_tensor = make_tensor({}); + auto r_tensor = make_tensor({}); + auto T_tensor = make_tensor({}); + float time_ms; + int num_iterations = 99; + + for (index_t i = 0; i < input_size; i++) + { + S_tensor(i) = (dtype)90 + dtype(i % 20); + } + K_tensor() = (dtype)100.; + V_tensor() = (dtype)0.1; + r_tensor() = (dtype)0.05; + T_tensor() = (dtype)1.0; + + cudaStream_t stream; + cudaStreamCreate(&stream); + cudaExecutor exec{stream}; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + BlackScholes(output_tensor, K_tensor, V_tensor, S_tensor, r_tensor, T_tensor).run(exec); + exec.sync(); + + cudaEventRecord(start, stream); + for (int i = 0; i < num_iterations; i++) + { + BlackScholes(output_tensor, K_tensor, V_tensor, S_tensor, r_tensor, T_tensor).run(exec); + } + cudaEventRecord(stop, stream); + exec.sync(); + cudaEventElapsedTime(&time_ms, start, stop); + + printf("Black-Scholes time = %.2fus per iteration\n", + time_ms * 1e3 / num_iterations); + + compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); + + printf("First 20 values of computed Black-Scholes output:\n"); + for (index_t i = 0; i < 20; i++) + { + printf("%f\n", static_cast(output_tensor(i))); + } + + cudaStreamDestroy(stream); + MATX_CUDA_CHECK_LAST_ERROR(); + MATX_EXIT_HANDLER(); +}