diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index f73442b6fd5..fd7d04c0843 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -40,7 +40,7 @@ jobs: git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head git checkout pull/${{ github.event.pull_request.number }}/head - # git submodule update --init --recursive + git submodule update --init --recursive fi diff --git a/Paddle b/Paddle index 2588f489910..47699dd459f 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 2588f4899106cd27bdfcc84ba4c2f5f7aac570ab +Subproject commit 47699dd459fdc8e525beac030d5c939b42128057 diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index e357a5e5912..3e92996f9a2 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -703,9 +703,9 @@ file( set(CUSTOM_DEVICE_SRCS ${CUDA_SRCS} ${CC_SRCS} ${ERNIE_CORE_SRCS}) set_source_files_properties(${CUSTOM_DEVICE_SRCS} PROPERTIES LANGUAGE CUDA) - +set(MACA_PATH $ENV{MACA_PATH}) set(CMAKE_CUCC_COMPILER "cucc") -set(CMAKE_CUCC_FLAGS "-I /opt/maca/tools/cu-bridge/include/") +set(CMAKE_CUCC_FLAGS "-I ${MACA_PATH}/tools/cu-bridge/include/") add_library(${TARGET_NAME} SHARED ${CUSTOM_DEVICE_SRCS}) @@ -734,9 +734,9 @@ target_link_libraries( ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcpti.so) +target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmccl.so) +target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcFlashAttn.so) +target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcpti.so) include_directories(BEFORE ${PADDLE_SOURCE_DIR}) diff --git a/backends/metax_gpu/cmake/dgc.cmake b/backends/metax_gpu/cmake/dgc.cmake index 4c54e636d5e..4c61f2e6bcb 100644 --- a/backends/metax_gpu/cmake/dgc.cmake +++ b/backends/metax_gpu/cmake/dgc.cmake @@ -62,8 +62,8 @@ if(EXISTS ${DGC_DOWNLOAD_DIR}/${DGC_CACHE_FILENAME}) else() download_dgc() endif() - -set(CU_BRIDGE_PATH "/opt/maca/tools/cu-bridge") +set(MACA_PATH $ENV{MACA_PATH}) +set(CU_BRIDGE_PATH "${MACA_PATH}/tools/cu-bridge") add_custom_command( OUTPUT "${CU_BRIDGE_PATH}/bin/nvcc" diff --git a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu index 6cdfb2f5242..6c46ef10c0f 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu @@ -119,7 +119,22 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ActivationGradGPUImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } - +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_DOUBLE_ATTRS_DEPX( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + double attr1, \ + double attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradGPUImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } #define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -239,10 +254,10 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, scale_a, scale_b); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus, - CudaSoftplusGradFunctor, - beta, - threshold); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_DOUBLE_ATTRS_DEPX(Softplus, + CudaSoftplusGradFunctor, + beta, + threshold); DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, CudaHardSigmoidGradFunctor, slope, diff --git a/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu index f24f3e8abbc..363932cfc28 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu @@ -90,7 +90,21 @@ void ActivationGPUImpl(const Context& dev_ctx, ActivationGPUImpl>( \ dev_ctx, x, out, functor); \ } - +#define DEFINE_GPU_ACT_KERNEL_WITH_TWO_DOUBLE_ATTRS( \ + name, functor_class, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + double attr1, \ + double attr2, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ + } DEFINE_GPU_ACTIVATION_KERNEL(Cos, CudaCosFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Tan, CudaTanFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Acos, CudaAcosFunctor) @@ -139,10 +153,10 @@ DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardTanh, t_min, t_max) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Stanh, CudaSTanhFunctor, scale_a, scale_b) -DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, - CudaSoftplusFunctor, - beta, - threshold) +DEFINE_GPU_ACT_KERNEL_WITH_TWO_DOUBLE_ATTRS(Softplus, + CudaSoftplusFunctor, + beta, + threshold) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, CudaHardSigmoidFunctor, slope, diff --git a/backends/metax_gpu/kernels/funcs/blas/blas.h b/backends/metax_gpu/kernels/funcs/blas/blas.h index fa4b4643f89..75ea8c921e2 100644 --- a/backends/metax_gpu/kernels/funcs/blas/blas.h +++ b/backends/metax_gpu/kernels/funcs/blas/blas.h @@ -282,6 +282,9 @@ class Blas { template T DOT(int n, const T* x, const T* y) const; + template + void CUDOT( + int n, const T* x, int incx, const T* y, int incy, T* result) const; template void SCAL(int n, const T a, T* x) const; @@ -541,7 +544,10 @@ class BlasT : private Blas { T DOT(ARGS... args) const { return Base()->template DOT(args...); } - + template + void CUDOT(ARGS... args) const { + Base()->template CUDOT(args...); + } template void SCAL(ARGS... args) const { Base()->template SCAL(args...); diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index f2e4f067bb2..7ba32b5b399 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -942,6 +942,19 @@ index f0cca0f701..02ea957240 100644 namespace phi { // To determine use cudnn or not. +diff --git a/paddle/phi/kernels/gpu/dot_kernel.cu b/paddle/phi/kernels/gpu/dot_kernel.cu +index af27ac89ab..ee0edc6b8e 100644 +--- a/paddle/phi/kernels/gpu/dot_kernel.cu ++++ b/paddle/phi/kernels/gpu/dot_kernel.cu +@@ -15,7 +15,7 @@ + #include "paddle/phi/kernels/dot_kernel.h" + #include "paddle/phi/backends/gpu/gpu_context.h" + #include "paddle/phi/core/kernel_registry.h" +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/eigen/common.h" + + #include "paddle/phi/kernels/full_kernel.h" diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h index 29fa252e96..4ae72b0935 100644 --- a/paddle/phi/kernels/gpu/gelu_funcs.h