From ddf00716a35cd4f6964dc49c912ab6cb34ff83b9 Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Thu, 18 Jul 2024 18:16:12 +1000 Subject: [PATCH 01/27] HIP compilation --- src/xc_integrator/local_work_driver/device/scheme1_base.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx index f42aff3c..452d239b 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx +++ b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx @@ -441,9 +441,9 @@ void AoSScheme1Base::eval_collocation_hessian( XCDeviceData* _data ) { eval_collocation_shell_to_task_hessian( max_l, data->l_batched_shell_to_task.data(), aos_stack.device_tasks, data->device_backend_->queue() ); -#endif data->device_backend_->check_error("collocation hess" __FILE__ ": " + std::to_string(__LINE__)); +#endif } void AoSScheme1Base::eval_collocation_laplacian( XCDeviceData* _data ) { @@ -461,9 +461,9 @@ void AoSScheme1Base::eval_collocation_laplacian( XCDeviceData* _data ) { eval_collocation_shell_to_task_laplacian( max_l, data->l_batched_shell_to_task.data(), aos_stack.device_tasks, data->device_backend_->queue() ); -#endif data->device_backend_->check_error("collocation lapl" __FILE__ ": " + std::to_string(__LINE__)); +#endif } From 6070afd5473a5c10d74e99cc628b475152544e4c Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Thu, 18 Jul 2024 19:07:20 +1000 Subject: [PATCH 02/27] Add hip version of mgga kernels --- .../device/hip/kernels/zmat_vxc.hip | 198 ++++++++++++++++++ 1 file changed, 198 insertions(+) diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip index d188e9e6..f4768325 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip @@ -143,6 +143,204 @@ void zmat_gga_vxc( size_t ntasks, } +template +__global__ void zmat_mgga_vxc_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + const auto* vrho_device = task.vrho; + const auto* vgamma_device = task.vgamma; + const double* vlapl_device = need_lapl ? task.vlapl : nullptr; + const auto* den_x_eval_device = task.ddenx; + const auto* den_y_eval_device = task.ddeny; + const auto* den_z_eval_device = task.ddenz; + + const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + const double* d2basis_lapl_eval_device = + need_lapl ? 
task.d2bflapl : nullptr; + + + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + const double fact_1 = 0.5 * vrho_device[tid_x] ; + const double fact_2 = 2.0 * vgamma_device[tid_x]; + + const double dx = den_x_eval_device[ tid_x ] * dbasis_x_eval_device[ ibfoff ]; + const double dy = den_y_eval_device[ tid_x ] * dbasis_y_eval_device[ ibfoff ]; + const double dz = den_z_eval_device[ tid_x ] * dbasis_z_eval_device[ ibfoff ]; + + double val = + fact_1 * basis_eval_device[ ibfoff ] + fact_2 * ( dx + dy + dz ); + + if constexpr (need_lapl) { + val += vlapl_device[tid_x] * d2basis_lapl_eval_device[ibfoff]; + } + + z_matrix_device[ ibfoff ] = val; + } +} + +void zmat_mgga_vxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + bool do_lapl, + device_queue queue ) { + + cudaStream_t stream = queue.queue_as() ; + + + dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1); + dim3 blocks( util::div_ceil( max_npts, threads.x ), + util::div_ceil( max_nbf, threads.y ), + ntasks ); + + if(do_lapl) + zmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + zmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + +} + + + + + + + + + + + + + + + + +template +__global__ void mmat_mgga_vxc_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + const auto* vtau_device = task.vtau; + const double* vlapl_device = need_lapl ? task.vlapl : nullptr; + + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + + auto* mmat_x = task.xmat_x; + auto* mmat_y = task.xmat_y; + auto* mmat_z = task.xmat_z; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + const double fact_1 = 0.25 * vtau_device[tid_x] + + (need_lapl ? 
vlapl_device[tid_x] : 0.0); + + mmat_x[ ibfoff ] = fact_1 * dbasis_x_eval_device[ ibfoff ]; + mmat_y[ ibfoff ] = fact_1 * dbasis_y_eval_device[ ibfoff ]; + mmat_z[ ibfoff ] = fact_1 * dbasis_z_eval_device[ ibfoff ]; + } +} + +//__global__ void print_zmat_stats( size_t ntasks, +// XCDeviceTask* tasks_device) { +// +// for(size_t iT = 0; iT < ntasks; ++iT) { +// auto& task = tasks_device[iT]; +// const auto npts = task.npts; +// const auto nbf = task.bfn_screening.nbe; +// +// const auto* zmat = task.zmat; +// const auto* bmat = task.bf; +// const auto* blmat = task.d2bflapl; +// +// double znrm = 0.0, bnrm = 0.0, blnrm = 0.0; +// for(auto j = 0; j < npts*nbf; ++j) { +// znrm += zmat[j] * zmat[j]; +// bnrm += bmat[j] * bmat[j]; +// blnrm += blmat[j] * blmat[j]; +// } +// +// const auto* eps = task.eps; +// const auto* vgamma = task.vgamma; +// const auto* vtau = task.vtau; +// const auto* vlapl = task.vlapl; +// const auto* vrho = task.vrho; +// const auto* gamma = task.gamma; +// const auto* tau = task.tau; +// const auto* lapl = task.denlapl; +// const auto* rho = task.den; +// double enrm = 0.0, gnrm = 0.0, tnrm = 0.0, rnrm = 0.0, lnrm = 0.0; +// double vgnrm = 0.0, vtnrm = 0.0, vrnrm = 0.0, vlnrm = 0.0; +// for(auto j = 0; j < npts; ++j) { +// enrm += eps[j] * eps[j]; +// vrnrm += vrho[j] * vrho[j]; +// vgnrm += vgamma[j] * vgamma[j]; +// vtnrm += vtau[j] * vtau[j]; +// vlnrm += vlapl[j] * vlapl[j]; +// +// rnrm += rho[j] * rho[j]; +// gnrm += gamma[j] * gamma[j]; +// tnrm += tau[j] * tau[j]; +// lnrm += lapl[j] * lapl[j]; +// } +// +// printf("ITASK = %lu B = %.6e BL = %.6e R = %.6e G = %.6e T = %.6e L = %.6e E = %.6e VR = %.6e VG = %6e VT = %.6e VL = %.6e Z = %.6e \n", +// iT, bnrm, blnrm, rnrm, gnrm, tnrm, lnrm, enrm, vrnrm, vgnrm, vtnrm, vlnrm, znrm); +// } +// +//} + +void mmat_mgga_vxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + bool do_lapl, + device_queue queue ) { + + cudaStream_t stream = queue.queue_as() ; + + + dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1); + dim3 blocks( util::div_ceil( max_npts, threads.x ), + util::div_ceil( max_nbf, threads.y ), + ntasks ); + + if(do_lapl) + mmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + mmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + + //print_zmat_stats<<<1,1,0,stream>>>(ntasks,tasks_device); +} + +} + From bb4559fc894d7ca4ca966f307cc6c19ee508475b Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Thu, 18 Jul 2024 20:33:32 +1000 Subject: [PATCH 03/27] Removed commented print --- .../device/hip/kernels/zmat_vxc.hip | 65 ------------------- 1 file changed, 65 deletions(-) diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip index f4768325..93547e97 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip @@ -217,20 +217,6 @@ void zmat_mgga_vxc( size_t ntasks, } - - - - - - - - - - - - - - template __global__ void mmat_mgga_vxc_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { @@ -267,55 +253,6 @@ __global__ void mmat_mgga_vxc_kernel( size_t ntasks, } } -//__global__ void print_zmat_stats( size_t ntasks, -// XCDeviceTask* tasks_device) { -// -// for(size_t iT = 0; iT < ntasks; ++iT) { -// auto& task = tasks_device[iT]; -// const auto npts = task.npts; -// const auto nbf = task.bfn_screening.nbe; -// -// const 
auto* zmat = task.zmat; -// const auto* bmat = task.bf; -// const auto* blmat = task.d2bflapl; -// -// double znrm = 0.0, bnrm = 0.0, blnrm = 0.0; -// for(auto j = 0; j < npts*nbf; ++j) { -// znrm += zmat[j] * zmat[j]; -// bnrm += bmat[j] * bmat[j]; -// blnrm += blmat[j] * blmat[j]; -// } -// -// const auto* eps = task.eps; -// const auto* vgamma = task.vgamma; -// const auto* vtau = task.vtau; -// const auto* vlapl = task.vlapl; -// const auto* vrho = task.vrho; -// const auto* gamma = task.gamma; -// const auto* tau = task.tau; -// const auto* lapl = task.denlapl; -// const auto* rho = task.den; -// double enrm = 0.0, gnrm = 0.0, tnrm = 0.0, rnrm = 0.0, lnrm = 0.0; -// double vgnrm = 0.0, vtnrm = 0.0, vrnrm = 0.0, vlnrm = 0.0; -// for(auto j = 0; j < npts; ++j) { -// enrm += eps[j] * eps[j]; -// vrnrm += vrho[j] * vrho[j]; -// vgnrm += vgamma[j] * vgamma[j]; -// vtnrm += vtau[j] * vtau[j]; -// vlnrm += vlapl[j] * vlapl[j]; -// -// rnrm += rho[j] * rho[j]; -// gnrm += gamma[j] * gamma[j]; -// tnrm += tau[j] * tau[j]; -// lnrm += lapl[j] * lapl[j]; -// } -// -// printf("ITASK = %lu B = %.6e BL = %.6e R = %.6e G = %.6e T = %.6e L = %.6e E = %.6e VR = %.6e VG = %6e VT = %.6e VL = %.6e Z = %.6e \n", -// iT, bnrm, blnrm, rnrm, gnrm, tnrm, lnrm, enrm, vrnrm, vgnrm, vtnrm, vlnrm, znrm); -// } -// -//} - void mmat_mgga_vxc( size_t ntasks, int32_t max_nbf, int32_t max_npts, @@ -335,8 +272,6 @@ void mmat_mgga_vxc( size_t ntasks, mmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); else mmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); - - //print_zmat_stats<<<1,1,0,stream>>>(ntasks,tasks_device); } } From 6d2d5f09c799c8018f58622c5b8b091fb0fc8522 Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Thu, 18 Jul 2024 21:21:05 +1000 Subject: [PATCH 04/27] Copy paste cleanup --- .../local_work_driver/device/hip/kernels/zmat_vxc.hip | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip index 93547e97..245eedd7 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip @@ -201,10 +201,10 @@ void zmat_mgga_vxc( size_t ntasks, bool do_lapl, device_queue queue ) { - cudaStream_t stream = queue.queue_as() ; + hipStream_t stream = queue.queue_as() ; - dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1); + dim3 threads(hip::warp_size,hip::max_warps_per_thread_block,1); dim3 blocks( util::div_ceil( max_npts, threads.x ), util::div_ceil( max_nbf, threads.y ), ntasks ); @@ -260,10 +260,10 @@ void mmat_mgga_vxc( size_t ntasks, bool do_lapl, device_queue queue ) { - cudaStream_t stream = queue.queue_as() ; + hipStream_t stream = queue.queue_as() ; - dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1); + dim3 threads(hip::warp_size,hip::max_warps_per_thread_block,1); dim3 blocks( util::div_ceil( max_npts, threads.x ), util::div_ceil( max_nbf, threads.y ), ntasks ); @@ -272,12 +272,9 @@ void mmat_mgga_vxc( size_t ntasks, mmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); else mmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); -} } - - } From 6b46eb6fc3faaaf32c2f1d81c8af370bcbc8e726 Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Thu, 18 Jul 2024 21:36:56 +1000 Subject: [PATCH 05/27] More missing HIP functions --- 
.../device/hip/kernels/uvvars.hip | 26 +++++++++++++++++-- .../device/hip/xc_functional_eval_wrapper.cxx | 10 +++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip index 8e93d33f..d491c9dc 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip @@ -194,11 +194,33 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max, } +void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max, + int32_t npts_max, XCDeviceTask* device_tasks, const double* denx, + const double* deny, const double* denz, double* gamma, bool do_lapl, + device_queue queue ) { + hipStream_t stream = queue.queue_as(); + // U Variables + { + dim3 threads( hip::warp_size, hip::max_warps_per_thread_block / 2, 1 ); + dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )), + std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )), + ntasks ); + eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + if(do_lapl) + eval_uvars_mgga_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + else + eval_uvars_mgga_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + } - - + // V variables (GAMMA) + dim3 threads( hip::max_threads_per_thread_block ); + dim3 blocks( util::div_ceil( npts_total, threads.x ) ); + eval_vvars_gga_kernel<<< blocks, threads, 0, stream >>>( + npts_total, denx, deny, denz, gamma + ); +} } diff --git a/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx b/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx index da8544ce..96ccb3b0 100644 --- a/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx +++ b/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx @@ -27,4 +27,14 @@ void eval_kern_exc_vxc_gga( const functional_type& func, size_t npts, } +void eval_kern_exc_vxc_mgga( const functional_type& func, size_t npts, + const double* rho, const double* gamma, const double* tau, const double* lapl, + double* eps, double* vrho, double* vgamma, double* vtau, double* vlapl, + device_queue queue ) { + + hipStream_t stream = queue.queue_as(); + func.eval_exc_vxc_device( npts, rho, gamma, lapl, tau, eps, vrho, vgamma, vlapl, vtau, stream ); + +} + } From c81132fda2c7a46f948528206ae2b8ca35560c9c Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Thu, 18 Jul 2024 21:43:39 +1000 Subject: [PATCH 06/27] Missing HIP kernel --- .../device/hip/kernels/uvvars.hip | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip index d491c9dc..34b50777 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip @@ -193,6 +193,118 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max, } +template +__global__ void eval_uvars_mgga_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + constexpr auto warp_size = hip::warp_size; + //constexpr auto max_warps_per_thread_block = hip::max_warps_per_thread_block; + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + 
const auto nbf = task.bfn_screening.nbe; + + auto* tau_eval_device = task.tau; + decltype(tau_eval_device) lapl_eval_device = nullptr; + if constexpr (need_lapl) { + lapl_eval_device = task.denlapl; + } + + //const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + decltype(dbasis_x_eval_device) basis_lapl_eval_device = nullptr; + if constexpr (need_lapl) { + basis_lapl_eval_device = task.d2bflapl; + } + + //const auto* den_basis_prod_device = task.zmat; + const auto* den_basis_dx_prod_device = task.xmat_x; + const auto* den_basis_dy_prod_device = task.xmat_y; + const auto* den_basis_dz_prod_device = task.xmat_z; + decltype(den_basis_dx_prod_device) den_basis_prod_device = nullptr; + if constexpr (need_lapl) { + den_basis_prod_device = task.zmat; + } + + __shared__ double den_shared[3+!!need_lapl][warp_size][GGA_KERNEL_SM_BLOCK_Y+1]; + + for ( int bid_x = blockIdx.x * blockDim.x; + bid_x < nbf; + bid_x += blockDim.x * gridDim.x ) { + + for ( int bid_y = blockIdx.y * GGA_KERNEL_SM_BLOCK_Y; + bid_y < npts; + bid_y += GGA_KERNEL_SM_BLOCK_Y * gridDim.y ) { + + for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) { + den_shared[0][threadIdx.x][sm_y] = 0.; + den_shared[1][threadIdx.x][sm_y] = 0.; + den_shared[2][threadIdx.x][sm_y] = 0.; + if constexpr (need_lapl) + den_shared[3][threadIdx.x][sm_y] = 0.; + + if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { + const double* db_x_col = den_basis_dx_prod_device + (bid_x + sm_y)*npts; + const double* db_y_col = den_basis_dy_prod_device + (bid_x + sm_y)*npts; + const double* db_z_col = den_basis_dz_prod_device + (bid_x + sm_y)*npts; + + const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts; + const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts; + const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts; + + + den_shared[0][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_x_col[ bid_y + threadIdx.x ]; + den_shared[1][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_y_col[ bid_y + threadIdx.x ]; + den_shared[2][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_z_col[ bid_y + threadIdx.x ]; + + + if constexpr (need_lapl) { + const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts; + const double* bf_l_col = basis_lapl_eval_device + (bid_x + sm_y)*npts; + den_shared[3][threadIdx.x][sm_y] = bf_l_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; + } + } + } + __syncthreads(); + + + for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) { + const int tid_y = bid_y + sm_y; + + register double tx_reg = den_shared[0][sm_y][threadIdx.x]; + register double ty_reg = den_shared[1][sm_y][threadIdx.x]; + register double tz_reg = den_shared[2][sm_y][threadIdx.x]; + // Warp blocks are stored col major + register double tau_reg = 0.0; + tau_reg = 0.5 * hip::warp_reduce_sum( tx_reg ); + tau_reg += 0.5 * hip::warp_reduce_sum( ty_reg ); + tau_reg += 0.5 * hip::warp_reduce_sum( tz_reg ); + + register double lapl_reg = 0.0; + if constexpr (need_lapl) { + lapl_reg = den_shared[3][sm_y][threadIdx.x]; + lapl_reg = hip::warp_reduce_sum(lapl_reg); + lapl_reg = 2. * lapl_reg + 4. 
* tau_reg;
+        }
+
+        if( threadIdx.x == 0 and tid_y < npts ) {
+          atomicAdd( tau_eval_device + tid_y, tau_reg );
+          if constexpr (need_lapl) {
+            atomicAdd( lapl_eval_device + tid_y, lapl_reg );
+          }
+        }
+      }
+      __syncthreads();
+    }
+  }
+}
+
 void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,

From c2e3cc2b7a3ff23f433c39f4b68616b8a9120108 Mon Sep 17 00:00:00 2001
From: Ryan Stocks
Date: Thu, 18 Jul 2024 21:50:12 +1000
Subject: [PATCH 07/27] Removed register from hip

---
 .../local_work_driver/device/hip/kernels/uvvars.hip | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip
index 34b50777..d5f1aa6c 100644
--- a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip
+++ b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip
@@ -193,6 +193,8 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max,
 }
 
+#define GGA_KERNEL_SM_BLOCK_Y 32
+
 template
 __global__ void eval_uvars_mgga_kernel( size_t ntasks,
@@ -277,16 +277,16 @@ __global__ void eval_uvars_mgga_kernel( size_t ntasks,
     for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) {
       const int tid_y = bid_y + sm_y;
 
-      register double tx_reg = den_shared[0][sm_y][threadIdx.x];
-      register double ty_reg = den_shared[1][sm_y][threadIdx.x];
-      register double tz_reg = den_shared[2][sm_y][threadIdx.x];
+      double tx_reg = den_shared[0][sm_y][threadIdx.x];
+      double ty_reg = den_shared[1][sm_y][threadIdx.x];
+      double tz_reg = den_shared[2][sm_y][threadIdx.x];
       // Warp blocks are stored col major
-      register double tau_reg = 0.0;
+      double tau_reg = 0.0;
       tau_reg = 0.5 * hip::warp_reduce_sum( tx_reg );
       tau_reg += 0.5 * hip::warp_reduce_sum( ty_reg );
       tau_reg += 0.5 * hip::warp_reduce_sum( tz_reg );
 
-      register double lapl_reg = 0.0;
+      double lapl_reg = 0.0;
       if constexpr (need_lapl) {
         lapl_reg = den_shared[3][sm_y][threadIdx.x];
         lapl_reg = hip::warp_reduce_sum(lapl_reg);
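Both the CUDA and HIP u-variable kernels funnel their per-warp partial sums through warp_reduce_sum before a single lane touches global memory with atomicAdd. GauXC's hip::warp_reduce_sum helper is not reproduced in this series; the following is a minimal shuffle-based sketch of what such a helper does, written against plain HIP (the loop structure and reliance on the built-in warpSize are assumptions, not the library's actual implementation):

  #include <hip/hip_runtime.h>

  // Wavefront-wide sum: each lane contributes v, and after the loop lane 0
  // holds the total. Assumes every lane of the wavefront is active, which
  // holds in the kernels above since all threads reach the reduction.
  __device__ inline double warp_reduce_sum( double v ) {
    // warpSize is 64 on most AMD GPUs; halve the shuffle distance each step.
    for( int offset = warpSize / 2; offset > 0; offset /= 2 )
      v += __shfl_down( v, offset );
    return v;
  }

Only lane 0 ends up with a meaningful value, which is why the kernels guard their atomicAdd calls with threadIdx.x == 0 rather than letting every lane write.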
From e9bf3a2b32dd4dd71e9418f564ba9991df7bcae1 Mon Sep 17 00:00:00 2001
From: Ryan Stocks
Date: Thu, 18 Jul 2024 22:15:29 +1000
Subject: [PATCH 08/27] Reduced shared mem req

---
 .../local_work_driver/device/cuda/kernels/uvvars.cu | 4 ++--
 .../local_work_driver/device/hip/kernels/uvvars.hip | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu
index c42d2182..911b665f 100644
--- a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu
+++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu
@@ -306,7 +306,7 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max,
   {
     dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
     dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-                 std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+                 std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
                  ntasks );
     eval_uvars_gga_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
   }
@@ -330,7 +330,7 @@ void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
   {
     dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
     dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-                 std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+                 std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
                  ntasks );
     eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
     if(do_lapl)
diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip
index d5f1aa6c..052e2ce9 100644
--- a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip
+++ b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip
@@ -193,7 +193,7 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max,
 }
 
-#define GGA_KERNEL_SM_BLOCK_Y 32
+#define GGA_KERNEL_SM_BLOCK_Y 16
 
 template
 __global__ void eval_uvars_mgga_kernel( size_t ntasks,
@@ -319,7 +319,7 @@ void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
   {
     dim3 threads( hip::warp_size, hip::max_warps_per_thread_block / 2, 1 );
     dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-                 std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+                 std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
                  ntasks );
     eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
     if(do_lapl)

From e9c616d0391b69ef024ffd6c12d7516ad24ea1c2 Mon Sep 17 00:00:00 2001
From: Ajay Panyala
Date: Fri, 19 Jul 2024 23:32:15 -0700
Subject: [PATCH 09/27] HIP discovery fixes

---
 CMakeLists.txt                                    |  6 ++++++
 cmake/gauxc-config.cmake.in                       | 14 ++++++++++++++
 src/runtime_environment/device/hip/CMakeLists.txt |  5 +++++
 3 files changed, 25 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 57cd29f8..489ae821 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,6 +99,12 @@ endif()
 if( GAUXC_ENABLE_HIP )
   enable_language( HIP )
   set( GAUXC_HAS_HIP TRUE CACHE BOOL "GauXC has HIP and will build HIP bindings" FORCE )
+  if(NOT DEFINED ROCM_PATH)
+    message(FATAL_ERROR "ROCM_PATH must be set")
+  endif()
+  if( NOT DEFINED CMAKE_HIP_ARCHITECTURES )
+    message( FATAL_ERROR "CMAKE_HIP_ARCHITECTURES must be set" )
+  endif()
 endif()
 
 # Decided if we're compiling device bindings
diff --git a/cmake/gauxc-config.cmake.in b/cmake/gauxc-config.cmake.in
index b036c240..98ff865d 100644
--- a/cmake/gauxc-config.cmake.in
+++ b/cmake/gauxc-config.cmake.in
@@ -50,6 +50,20 @@ if( GAUXC_HAS_CUDA )
   endif()
 endif()
 
+if( GAUXC_HAS_HIP )
+  enable_language( HIP )
+  set (CMAKE_HIP_ARCHITECTURES @CMAKE_HIP_ARCHITECTURES@)
+  set (ROCM_PATH @ROCM_PATH@)
+
+  list (PREPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip ${ROCM_PATH}/hipblas)
+  set(GPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "AMD GPU targets to compile for")
+
+  find_package( hip REQUIRED )
+  find_package( hipblas REQUIRED )
+
+  list(REMOVE_AT CMAKE_PREFIX_PATH 0 1 2)
+endif()
+
 if( GAUXC_HAS_MPI )
   find_dependency( MPI )
 endif()
diff --git a/src/runtime_environment/device/hip/CMakeLists.txt b/src/runtime_environment/device/hip/CMakeLists.txt
index df97901b..fef39095 100644
--- a/src/runtime_environment/device/hip/CMakeLists.txt
+++ b/src/runtime_environment/device/hip/CMakeLists.txt
@@ -6,8 +6,13 @@
 # See LICENSE.txt for details
 #
 
+list (PREPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip ${ROCM_PATH}/hipblas)
+set(GPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "AMD GPU targets to compile for")
+
 find_package( hip REQUIRED )
 find_package( hipblas REQUIRED )
 
+list(REMOVE_AT CMAKE_PREFIX_PATH 0 1 2)
+
 target_sources( gauxc PRIVATE hip_backend.cxx )
 target_link_libraries( gauxc PUBLIC hip::host roc::hipblas )
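With patch 09, a HIP build of GauXC fails fast unless ROCM_PATH and CMAKE_HIP_ARCHITECTURES are set, and the installed gauxc-config.cmake replays the same ROCm prefix handling on the consumer side. A minimal downstream CMakeLists.txt against the HIP build might therefore look like the sketch below; the project name, source file, and architecture value are illustrative, and only find_package( gauxc ) and the gauxc::gauxc target are taken from this series:

  cmake_minimum_required( VERSION 3.21 )
  project( gauxc_hip_consumer LANGUAGES CXX HIP )

  # gauxc-config.cmake re-runs the hip/hipblas discovery itself, so the
  # consumer only needs the package. ROCM_PATH and CMAKE_HIP_ARCHITECTURES
  # must still be supplied on the configure line, e.g.
  #   cmake -DROCM_PATH=/opt/rocm -DCMAKE_HIP_ARCHITECTURES=gfx90a ...
  find_package( gauxc REQUIRED )

  add_executable( xc_driver xc_driver.cxx )
  target_link_libraries( xc_driver PUBLIC gauxc::gauxc )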
From b781db38835c156fd364392cde7e53cdda342137 Mon Sep 17 00:00:00 2001
From: Ajay Panyala
Date: Sat, 20 Jul 2024 00:41:04 -0700
Subject: [PATCH 10/27] update readme [skip ci]

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 0699b90b..c039302c 100644
--- a/README.md
+++ b/README.md
@@ -209,8 +209,10 @@ target_link_libraries( my_target PUBLIC gauxc::gauxc )
 | `GAUXC_ENABLE_MPI`         | Enable MPI Bindings                                   | `ON` |
 | `GAUXC_ENABLE_OPENMP`      | Enable OpenMP Bindings                                | `ON` |
 | `CMAKE_CUDA_ARCHITECTURES` | CUDA architectures (e.g. 70 for Volta, 80 for Ampere) | --   |
+| `CMAKE_HIP_ARCHITECTURES`  | HIP architectures (e.g. gfx90a for MI250X)            | --   |
 | `BLAS_LIBRARIES`           | Full BLAS linker.                                     | --   |
 | `MAGMA_ROOT_DIR`           | Install prefix for MAGMA.                             | --   |
+| `ROCM_PATH`                | Install prefix for ROCM.                              | --   |

From 08ae3117d3827f5ad2b9fc926242904ca7fb49ef Mon Sep 17 00:00:00 2001
From: Ajay Panyala
Date: Sun, 28 Jul 2024 17:14:01 -0400
Subject: [PATCH 11/27] update ExchCXX hash

---
 cmake/gauxc-dep-versions.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/gauxc-dep-versions.cmake b/cmake/gauxc-dep-versions.cmake
index cd3969d8..4e1dd9be 100644
--- a/cmake/gauxc-dep-versions.cmake
+++ b/cmake/gauxc-dep-versions.cmake
@@ -11,7 +11,7 @@ set( GAUXC_CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git )
 set( GAUXC_CUTLASS_REVISION v2.10.0 )
 
 set( GAUXC_EXCHCXX_REPOSITORY https://github.com/wavefunction91/ExchCXX.git )
-set( GAUXC_EXCHCXX_REVISION 21a4700a826ec0beae1311a1d59677393bcb168f )
+set( GAUXC_EXCHCXX_REVISION 7b017151061f155cac341f2d155aaa47e43a8190 )
 
 set( GAUXC_GAU2GRID_REPOSITORY https://github.com/dgasmith/gau2grid.git )
 set( GAUXC_GAU2GRID_REVISION v2.0.6 )

From 5b2273ed6bfd198191ed506d27431136ee345070 Mon Sep 17 00:00:00 2001
From: Ryan Stocks
Date: Sat, 21 Sep 2024 21:06:05 +0800
Subject: [PATCH 12/27] Fixed HIP compilation

---
 .../local_work_driver/device/hip/kernels/uvvars.hip |  9 +++++----
 .../device/hip/kernels/zmat_vxc.hip                 |  4 ++++
 .../local_work_driver/device/scheme1_base.cxx       | 12 ++++++++++++
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip
index 3a41d424..d7fc4020 100644
--- a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip
+++ b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip
@@ -57,13 +57,14 @@ __global__ void eval_uvars_lda_kernel( size_t ntasks,
 }
 
-void eval_uvvars_lda( size_t ntasks, int32_t nbf_max, int32_t npts_max,
-  XCDeviceTask* device_tasks, device_queue queue ) {
-
+void eval_uvars_lda( size_t ntasks, int32_t npts_max,
+                     integrator_ks_scheme scheme,
+                     XCDeviceTask* device_tasks,
+                     device_queue queue ) {
   hipStream_t stream = queue.queue_as();
 
   dim3 threads(hip::warp_size, hip::max_warps_per_thread_block, 1);
-  dim3 blocks( util::div_ceil( nbf_max , threads.x ),
+  dim3 blocks( util::div_ceil( npts_max , threads.x ),
                util::div_ceil( npts_max , threads.y ),
                ntasks );
 
diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip
index 39657d2c..c8b09f7e 100644
--- a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip
+++ b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip
@@ -50,6 +50,8 @@ void zmat_lda_vxc( size_t ntasks,
     int32_t        max_nbf,
     int32_t        max_npts,
     XCDeviceTask*  tasks_device,
+ integrator_ks_scheme scheme, + density_id sel, device_queue queue ) { hipStream_t stream = queue.queue_as() ; @@ -128,6 +130,8 @@ void zmat_gga_vxc( size_t ntasks, int32_t max_nbf, int32_t max_npts, XCDeviceTask* tasks_device, + integrator_ks_scheme scheme, + density_id sel, device_queue queue ) { hipStream_t stream = queue.queue_as() ; diff --git a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx index d714927c..d33d2b1f 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx +++ b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx @@ -530,8 +530,12 @@ void AoSScheme1Base::eval_uvars_gga( XCDeviceData* _data, integrator_ks_scheme k // Evaluate U variable auto aos_stack = data->aos_stack; +#ifdef GAUXC_HAS_HIP + throw std::runtime_error("uvars_gga not supported with HIP"); +#else GauXC::eval_uvars_gga( ntasks, npts_max, ks_scheme, aos_stack.device_tasks, data->device_backend_->queue() ); +#endif data->device_backend_->check_error("uvvar gga" __FILE__ ": " + std::to_string(__LINE__)); @@ -562,8 +566,12 @@ void AoSScheme1Base::eval_uvars_mgga( XCDeviceData* _data, bool do_lapl ){ // Evaluate U variables auto aos_stack = data->aos_stack; +#ifdef GAUXC_HAS_HIP + throw std::runtime_error("uvars_mgga not supported with HIP"); +#else GauXC::eval_uvars_mgga( ntasks, data->total_npts_task_batch, nbe_max, npts_max, do_lapl, aos_stack.device_tasks, data->device_backend_->queue() ); +#endif data->device_backend_->check_error("uvvar mgga" __FILE__ ": " + std::to_string(__LINE__)); @@ -629,8 +637,12 @@ void AoSScheme1Base::eval_vvar( XCDeviceData* _data, density_id den_select, bool // Evaluate V variable auto aos_stack = data->aos_stack; +#ifdef GAUXC_HAS_HIP + throw std::runtime_error("vvar not supported with HIP"); +#else GauXC::eval_vvar( ntasks, nbe_max, npts_max, do_grad, den_select, aos_stack.device_tasks, data->device_backend_->queue() ); +#endif } From 9d9145dcaefa8cc9919086ef219771ebe9648087 Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Sat, 21 Sep 2024 21:06:53 +0800 Subject: [PATCH 13/27] hipblas.h -> hipblas/hipblas.h --- src/exceptions/hipblas_exception.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/exceptions/hipblas_exception.hpp b/src/exceptions/hipblas_exception.hpp index 9bb39011..186d639e 100644 --- a/src/exceptions/hipblas_exception.hpp +++ b/src/exceptions/hipblas_exception.hpp @@ -14,7 +14,7 @@ #ifdef GAUXC_HAS_HIP #include "hip/hip_runtime.h" -#include +#include namespace GauXC { From 1ad6fd435f13f3710f19804e2bc44e1661bef5d0 Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Sat, 21 Sep 2024 21:29:19 +0800 Subject: [PATCH 14/27] Renamed SM_BLOCK_Y for cuda compilation --- .../local_work_driver/device/cuda/kernels/uvvars.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu index 6675bfe1..f392757c 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu @@ -457,7 +457,7 @@ void eval_uvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max, { dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 ); dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )), - std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )), + 
std::min(uint64_t(MGGA_KERNEL_SM_BLOCK), util::div_ceil( npts_max, MGGA_KERNEL_SM_BLOCK )), ntasks ); if(do_lapl) eval_uvars_mgga_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); From af72d052829c51a72825af4539b100a6a601224d Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Sun, 22 Sep 2024 11:43:29 +0800 Subject: [PATCH 15/27] Move a bunch of cuda -> hip --- .../device/hip/kernels/uvvars.hip | 619 ++++++++++++++---- .../local_work_driver/device/scheme1_base.cxx | 12 - 2 files changed, 484 insertions(+), 147 deletions(-) diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip index d7fc4020..617835b2 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip @@ -13,8 +13,16 @@ namespace GauXC { +#define VVAR_KERNEL_SM_BLOCK 16 +#define GGA_KERNEL_SM_WARPS 16 +#define MGGA_KERNEL_SM_BLOCK 16 -__global__ void eval_uvars_lda_kernel( size_t ntasks, +__global__ void eval_uvars_lda_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + // eval_vvars populated uvar storage already in the case of LDA+RKS + return; +} + +__global__ void eval_uvars_lda_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { const int batch_idx = blockIdx.z; @@ -23,181 +31,275 @@ __global__ void eval_uvars_lda_kernel( size_t ntasks, auto& task = tasks_device[ batch_idx ]; const auto npts = task.npts; - const auto nbf = task.bfn_screening.nbe; - - auto* den_eval_device = task.den; - - const auto* basis_eval_device = task.bf; - - const auto* den_basis_prod_device = task.zmat; - - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - double den_reg = 0.; - if( tid_x < nbf and tid_y < npts ) { + auto* den_pos_eval_device = task.den_s; + auto* den_neg_eval_device = task.den_z; - const double* bf_col = basis_eval_device + tid_x*npts; - const double* db_col = den_basis_prod_device + tid_x*npts; - den_reg = bf_col[ tid_y ] * db_col[ tid_y ]; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; - } - - // Warp blocks are stored col major - den_reg = 2 * hip::warp_reduce_sum( den_reg ); + if( tid < npts ) { + const auto ps = den_pos_eval_device[ tid ]; + const auto pz = den_neg_eval_device[ tid ]; + den_pos_eval_device[ tid ] = 0.5*(ps + pz); + den_neg_eval_device[ tid ] = 0.5*(ps - pz); - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); } - - } +__global__ void eval_uvars_lda_gks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { -void eval_uvars_lda( size_t ntasks, int32_t npts_max, - integrator_ks_scheme scheme, - XCDeviceTask* device_tasks, - device_queue queue ) { - hipStream_t stream = queue.queue_as(); + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; - dim3 threads(hip::warp_size, hip::max_warps_per_thread_block, 1); - dim3 blocks( util::div_ceil( npts_max , threads.x ), - util::div_ceil( npts_max , threads.y ), - ntasks ); + auto& task = tasks_device[ batch_idx ]; - hipLaunchKernelGGL(eval_uvars_lda_kernel, dim3(blocks), dim3(threads), 0, - stream, ntasks, device_tasks ); + const auto npts = task.npts; -} + auto* den_z_eval_device = task.den_s; + auto* den_s_eval_device = task.den_z; + auto* den_y_eval_device = task.den_y; + auto* den_x_eval_device = task.den_x; + auto* K_z_eval_device = task.K_z; + auto* K_y_eval_device = task.K_y; + auto* K_x_eval_device = task.K_x; + const 
double dtolsq = 1e-24; // TODO: make variable + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if( tid < npts ) { + const auto ps = den_s_eval_device[ tid ]; + const auto pz = den_z_eval_device[ tid ]; + const auto py = den_y_eval_device[ tid ]; + const auto px = den_x_eval_device[ tid ]; + const auto mtemp = pz*pz + px*px + py*py; + double mnorm = 0.; + + if (mtemp > dtolsq) { + const double inv_mnorm = rsqrt(mtemp); + mnorm = 1./inv_mnorm; + K_z_eval_device[ tid ] = pz * inv_mnorm; + K_y_eval_device[ tid ] = py * inv_mnorm; + K_x_eval_device[ tid ] = px * inv_mnorm; + } + else { + mnorm = (1. / 3.) * (px + py + pz); + K_z_eval_device[ tid ] = 1. / 3.; + K_y_eval_device[ tid ] = 1. / 3.; + K_x_eval_device[ tid ] = 1. / 3.; + } + den_s_eval_device[ tid ] = 0.5*(ps + mnorm); + den_z_eval_device[ tid ] = 0.5*(ps - mnorm); + } +} +__global__ void eval_uvars_gga_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + auto* gamma_eval_device = task.gamma; + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + if( tid < npts ) { + const double dx = dden_sx_eval_device[ tid ]; + const double dy = dden_sy_eval_device[ tid ]; + const double dz = dden_sz_eval_device[ tid ]; + gamma_eval_device[ tid ] = dx*dx + dy*dy + dz*dz; + } +} -__global__ void eval_uvars_gga_kernel( size_t ntasks, - XCDeviceTask* tasks_device ) { +__global__ void eval_uvars_gga_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { const int batch_idx = blockIdx.z; if( batch_idx >= ntasks ) return; - auto& task = tasks_device[ batch_idx ]; - + const auto& task = tasks_device[ batch_idx ]; const auto npts = task.npts; - const auto nbf = task.bfn_screening.nbe; - auto* den_eval_device = task.den; - auto* den_x_eval_device = task.den_x; - auto* den_y_eval_device = task.den_y; - auto* den_z_eval_device = task.den_z; + auto* den_pos_eval_device = task.den_s; + const auto* den_pos_x_eval_device = task.dden_sx; + const auto* den_pos_y_eval_device = task.dden_sy; + const auto* den_pos_z_eval_device = task.dden_sz; - const auto* basis_eval_device = task.bf; - const auto* dbasis_x_eval_device = task.dbfx; - const auto* dbasis_y_eval_device = task.dbfy; - const auto* dbasis_z_eval_device = task.dbfz; + auto* den_neg_eval_device = task.den_z; + const auto* den_neg_x_eval_device = task.dden_zx; + const auto* den_neg_y_eval_device = task.dden_zy; + const auto* den_neg_z_eval_device = task.dden_zz; - const auto* den_basis_prod_device = task.zmat; + auto* gamma_pp_eval_device = task.gamma_pp; + auto* gamma_pm_eval_device = task.gamma_pm; + auto* gamma_mm_eval_device = task.gamma_mm; - // We always launch enough blocks to cover npts, so blocks aren't doing multiple results - double den_reg = 0.; - double dx_reg = 0.; - double dy_reg = 0.; - double dz_reg = 0.; - - // Have each thread accumulate its own reduction result into a register. - // There's no real _need_ for LDS because the reductions are small and - // therefore can be done without sharing. 
- for( int ibf = 0; ibf < nbf; ibf++ ) { - - for( int ipt = blockIdx.x * blockDim.x + threadIdx.x; ipt < npts; ipt += blockDim.x * gridDim.x ) { - - const double* bf_col = basis_eval_device + ibf*npts; - const double* bf_x_col = dbasis_x_eval_device + ibf*npts; - const double* bf_y_col = dbasis_y_eval_device + ibf*npts; - const double* bf_z_col = dbasis_z_eval_device + ibf*npts; - const double* db_col = den_basis_prod_device + ibf*npts; - - den_reg += 2 * bf_col[ ipt ] * db_col[ ipt ]; - dx_reg += 4 * bf_x_col[ ipt ] * db_col[ ipt ]; - dy_reg += 4 * bf_y_col[ ipt ] * db_col[ ipt ]; - dz_reg += 4 * bf_z_col[ ipt ] * db_col[ ipt ]; - } - } + const int tid = blockIdx.x * blockDim.x + threadIdx.x; - - for( int ipt = blockIdx.x * blockDim.x + threadIdx.x; ipt < npts; ipt += blockDim.x * gridDim.x ) { - den_eval_device [ipt] = den_reg; - den_x_eval_device [ipt] = dx_reg ; - den_y_eval_device [ipt] = dy_reg ; - den_z_eval_device [ipt] = dz_reg ; + if( tid < npts ) { + const double ps = den_pos_eval_device[ tid ]; + const double pz = den_neg_eval_device[ tid ]; + const double dndx = den_pos_x_eval_device[ tid ]; + const double dndy = den_pos_y_eval_device[ tid ]; + const double dndz = den_pos_z_eval_device[ tid ]; + const double dMzdx = den_neg_x_eval_device[ tid ]; + const double dMzdy = den_neg_y_eval_device[ tid ]; + const double dMzdz = den_neg_z_eval_device[ tid ]; + + // (del n).(del n) + const auto dn_sq = dndx*dndx + dndy*dndy + dndz*dndz; + // (del Mz).(del Mz) + const auto dMz_sq = dMzdx*dMzdx + dMzdy*dMzdy + dMzdz*dMzdz; + // (del n).(del Mz) + const auto dn_dMz = dndx*dMzdx + dndy*dMzdy + dndz*dMzdz; + + gamma_pp_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) + 0.5*dn_dMz; + gamma_pm_eval_device[ tid ] = 0.25*(dn_sq - dMz_sq); + gamma_mm_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) - 0.5*dn_dMz; + + den_pos_eval_device[ tid ] = 0.5*(ps + pz); + den_neg_eval_device[ tid ] = 0.5*(ps - pz); } } +__global__ void eval_uvars_gga_gks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { -__global__ void eval_vvars_gga_kernel( - size_t npts, - const double* den_x_eval_device, - const double* den_y_eval_device, - const double* den_z_eval_device, - double* gamma_eval_device -) { + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - if( tid < npts ) { + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; - const double dx = den_x_eval_device[ tid ]; - const double dy = den_y_eval_device[ tid ]; - const double dz = den_z_eval_device[ tid ]; + auto* den_s_eval_device = task.den_s; + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; - gamma_eval_device[tid] = dx*dx + dy*dy + dz*dz; + auto* den_z_eval_device = task.den_z; + const auto* dden_zx_eval_device = task.dden_zx; + const auto* dden_zy_eval_device = task.dden_zy; + const auto* dden_zz_eval_device = task.dden_zz; - } + const auto* den_y_eval_device = task.den_y; + const auto* dden_yx_eval_device = task.dden_yx; + const auto* dden_yy_eval_device = task.dden_yy; + const auto* dden_yz_eval_device = task.dden_yz; -} + const auto* den_x_eval_device = task.den_x; + const auto* dden_xx_eval_device = task.dden_xx; + const auto* dden_xy_eval_device = task.dden_xy; + const auto* dden_xz_eval_device = task.dden_xz; + auto* gamma_pp_eval_device = task.gamma_pp; + auto* gamma_pm_eval_device = task.gamma_pm; + auto* gamma_mm_eval_device = task.gamma_mm; + auto* 
H_z_eval_device = task.H_z; + auto* H_y_eval_device = task.H_y; + auto* H_x_eval_device = task.H_x; + auto* K_z_eval_device = task.K_z; + auto* K_y_eval_device = task.K_y; + auto* K_x_eval_device = task.K_x; + const double dtolsq = 1e-24; // TODO: make variable -void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max, - int32_t npts_max, XCDeviceTask* device_tasks, const double* denx, - const double* deny, const double* denz, double* gamma, device_queue queue ) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; - hipStream_t stream = queue.queue_as(); + if( tid < npts ) { + const double dndz = dden_sz_eval_device[ tid ]; + const double dndy = dden_sy_eval_device[ tid ]; + const double dndx = dden_sx_eval_device[ tid ]; + + const double dMzdz = dden_zz_eval_device[ tid ]; + const double dMzdy = dden_zy_eval_device[ tid ]; + const double dMzdx = dden_zx_eval_device[ tid ]; + + const double dMydz = dden_yz_eval_device[ tid ]; + const double dMydy = dden_yy_eval_device[ tid ]; + const double dMydx = dden_yx_eval_device[ tid ]; + + const double dMxdz = dden_xz_eval_device[ tid ]; + const double dMxdy = dden_xy_eval_device[ tid ]; + const double dMxdx = dden_xx_eval_device[ tid ]; + + const auto ps = den_s_eval_device[ tid ]; + const auto pz = den_z_eval_device[ tid ]; + const auto py = den_y_eval_device[ tid ]; + const auto px = den_x_eval_device[ tid ]; + + const auto mtemp = pz*pz + px*px + py*py; + double mnorm = 0.; + + const auto dels_dot_dels = dndx * dndx + dndy * dndy + dndz * dndz; + const auto delz_dot_delz = dMzdx * dMzdx + dMzdy * dMzdy + dMzdz * dMzdz; + const auto delx_dot_delx = dMxdx * dMxdx + dMxdy * dMxdy + dMxdz * dMxdz; + const auto dely_dot_dely = dMydx * dMydx + dMydy * dMydy + dMydz * dMydz; + + const auto dels_dot_delz = dndx * dMzdx + dndy * dMzdy + dndz * dMzdz; + const auto dels_dot_delx = dndx * dMxdx + dndy * dMxdy + dndz * dMxdz; + const auto dels_dot_dely = dndx * dMydx + dndy * dMydy + dndz * dMydz; + + const auto sum = delz_dot_delz + delx_dot_delx + dely_dot_dely; + const auto s_sum = + dels_dot_delz * pz + dels_dot_delx * px + dels_dot_dely * py; + + const auto inv_sqsum2 = + rsqrt(dels_dot_delz * dels_dot_delz + dels_dot_delx * dels_dot_delx + + dels_dot_dely * dels_dot_dely); + const auto sqsum2 = 1./inv_sqsum2; + + double sign = 1.; + if( signbit(s_sum)) + sign = -1.; + + + if (mtemp > dtolsq) { + const double inv_mnorm = rsqrt(mtemp); + mnorm = 1./inv_mnorm; + K_z_eval_device[ tid ] = pz * inv_mnorm; + K_y_eval_device[ tid ] = py * inv_mnorm; + K_x_eval_device[ tid ] = px * inv_mnorm; + H_z_eval_device[ tid ] = sign * dels_dot_delz * inv_sqsum2; + H_y_eval_device[ tid ] = sign * dels_dot_dely * inv_sqsum2; + H_x_eval_device[ tid ] = sign * dels_dot_delx * inv_sqsum2; + } + else { + mnorm = (1. / 3.) * (px + py + pz); + K_z_eval_device[ tid ] = 1. / 3.; + K_y_eval_device[ tid ] = 1. / 3.; + K_x_eval_device[ tid ] = 1. 
/ 3.; + + H_z_eval_device[ tid ] = sign / 3.; + H_y_eval_device[ tid ] = sign / 3.; + H_x_eval_device[ tid ] = sign / 3.; + } - // U Variables - { - dim3 threads(hip::max_threads_per_thread_block, 1, 1); - dim3 blocks( util::div_ceil( npts_max , threads.x ), - 1, - ntasks ); + gamma_pp_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) + 0.5*sign*sqsum2; + gamma_pm_eval_device[ tid ] = 0.25*(dels_dot_dels - sum); + gamma_mm_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) - 0.5*sign*sqsum2; - hipLaunchKernelGGL(eval_uvars_gga_kernel, dim3(blocks), dim3(threads), 0, - stream, ntasks, device_tasks ); - } + den_s_eval_device[ tid ] = 0.5*(ps + mnorm); + den_z_eval_device[ tid ] = 0.5*(ps - mnorm); - // V Variables - dim3 threads( hip::max_threads_per_thread_block ); - dim3 blocks( util::div_ceil( npts_total, threads.x ) ); - hipLaunchKernelGGL(eval_vvars_gga_kernel, blocks, threads, 0, stream, - npts_total, denx, deny, denz, gamma); + } } -#define GGA_KERNEL_SM_BLOCK_Y 16 - template -__global__ void eval_uvars_mgga_kernel( size_t ntasks, +__global__ void eval_uvars_mgga_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { constexpr auto warp_size = hip::warp_size; @@ -235,17 +337,17 @@ __global__ void eval_uvars_mgga_kernel( size_t ntasks, den_basis_prod_device = task.zmat; } - __shared__ double den_shared[3+!!need_lapl][warp_size][GGA_KERNEL_SM_BLOCK_Y+1]; + __shared__ double den_shared[3+!!need_lapl][warp_size][MGGA_KERNEL_SM_BLOCK+1]; for ( int bid_x = blockIdx.x * blockDim.x; bid_x < nbf; bid_x += blockDim.x * gridDim.x ) { - for ( int bid_y = blockIdx.y * GGA_KERNEL_SM_BLOCK_Y; + for ( int bid_y = blockIdx.y * MGGA_KERNEL_SM_BLOCK; bid_y < npts; - bid_y += GGA_KERNEL_SM_BLOCK_Y * gridDim.y ) { + bid_y += MGGA_KERNEL_SM_BLOCK * gridDim.y ) { - for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) { + for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) { den_shared[0][threadIdx.x][sm_y] = 0.; den_shared[1][threadIdx.x][sm_y] = 0.; den_shared[2][threadIdx.x][sm_y] = 0.; @@ -277,7 +379,7 @@ __global__ void eval_uvars_mgga_kernel( size_t ntasks, __syncthreads(); - for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) { + for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) { const int tid_y = bid_y + sm_y; double tx_reg = den_shared[0][sm_y][threadIdx.x]; @@ -309,33 +411,280 @@ __global__ void eval_uvars_mgga_kernel( size_t ntasks, } -void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max, - int32_t npts_max, XCDeviceTask* device_tasks, const double* denx, - const double* deny, const double* denz, double* gamma, bool do_lapl, - device_queue queue ) { +#define EVAL_UVARS_KERNEL(xc_approx) \ + hipStream_t stream = queue.queue_as(); \ + dim3 blocks( util::div_ceil( npts_max, threads.x ), \ + 1, \ + ntasks ); \ + switch ( ks_scheme ) { \ + case RKS: \ + eval_uvars_##xc_approx##_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + case UKS: \ + eval_uvars_##xc_approx##_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + case GKS: \ + eval_uvars_##xc_approx##_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + default: \ + GAUXC_GENERIC_EXCEPTION( "Unexpected KS scheme when attempting to evaluate UV vars" ); \ + } + +void eval_uvars_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + XCDeviceTask* device_tasks, device_queue queue ) { + dim3 threads( 
hip::max_warps_per_thread_block * hip::warp_size, 1, 1 );
+  EVAL_UVARS_KERNEL(lda);
+}
+
+
+void eval_uvars_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme,
+  XCDeviceTask* device_tasks, device_queue queue ) {
+  dim3 threads( GGA_KERNEL_SM_WARPS * hip::warp_size, 1, 1 );
+  EVAL_UVARS_KERNEL(gga);
+}
+
+
+
+void eval_uvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
+  int32_t npts_max, bool do_lapl, XCDeviceTask* device_tasks,
+  device_queue queue ) {
+  // TODO: This interface should be unified with the lda/gga interfaces
   hipStream_t stream = queue.queue_as<util::hip_stream>();
   // U Variables
   {
   dim3 threads( hip::warp_size, hip::max_warps_per_thread_block / 2, 1 );
   dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-              std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
+              std::min(uint64_t(MGGA_KERNEL_SM_BLOCK), util::div_ceil( npts_max, MGGA_KERNEL_SM_BLOCK )),
               ntasks );
-  eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
   if(do_lapl)
-    eval_uvars_mgga_kernel<true><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+    eval_uvars_mgga_rks_kernel<true><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
   else
-    eval_uvars_mgga_kernel<false><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+    eval_uvars_mgga_rks_kernel<false><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
   }

   // V variables (GAMMA)
   dim3 threads( hip::max_threads_per_thread_block );
-  dim3 blocks( util::div_ceil( npts_total, threads.x ) );
-  eval_vvars_gga_kernel<<< blocks, threads, 0, stream >>>(
-    npts_total, denx, deny, denz, gamma
-  );
+  dim3 blocks( util::div_ceil( npts_total, threads.x ),
+               1,
+               ntasks );
+  eval_uvars_gga_rks_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
 }
+
+
+
+
+
+template <density_id den_select>
+__global__ void eval_vvar_grad_kern( size_t ntasks,
+  XCDeviceTask* tasks_device ) {
+
+  const int batch_idx = blockIdx.z;
+  if( batch_idx >= ntasks ) return;
+
+  auto& task = tasks_device[ batch_idx ];
+
+  const auto npts = task.npts;
+  const auto nbf  = task.bfn_screening.nbe;
+
+  double* den_eval_device   = nullptr;
+  double* den_x_eval_device = nullptr;
+  double* den_y_eval_device = nullptr;
+  double* den_z_eval_device = nullptr;
+
+  constexpr auto warp_size = hip::warp_size;
+
+  if constexpr (den_select == DEN_S) {
+    den_eval_device   = task.den_s;
+    den_x_eval_device = task.dden_sx;
+    den_y_eval_device = task.dden_sy;
+    den_z_eval_device = task.dden_sz;
+  }
+  if constexpr (den_select == DEN_Z) {
+    den_eval_device   = task.den_z;
+    den_x_eval_device = task.dden_zx;
+    den_y_eval_device = task.dden_zy;
+    den_z_eval_device = task.dden_zz;
+  }
+  if constexpr (den_select == DEN_Y) {
+    den_eval_device   = task.den_y;
+    den_x_eval_device = task.dden_yx;
+    den_y_eval_device = task.dden_yy;
+    den_z_eval_device = task.dden_yz;
+  }
+  if constexpr (den_select == DEN_X) {
+    den_eval_device   = task.den_x;
+    den_x_eval_device = task.dden_xx;
+    den_y_eval_device = task.dden_xy;
+    den_z_eval_device = task.dden_xz;
+  }
+
+  const auto* basis_eval_device = task.bf;
+  const auto* dbasis_x_eval_device = task.dbfx;
+  const auto* dbasis_y_eval_device = task.dbfy;
+  const auto* dbasis_z_eval_device = task.dbfz;
+
+  const auto* den_basis_prod_device = task.zmat;
+
+  __shared__ double den_shared[4][warp_size][VVAR_KERNEL_SM_BLOCK+1];
+
+  for ( int bid_x = blockIdx.x * blockDim.x;
+        bid_x < nbf;
+        bid_x += blockDim.x * gridDim.x ) {
+
+    for ( int bid_y = blockIdx.y * VVAR_KERNEL_SM_BLOCK;
+          bid_y < npts;
+          bid_y += VVAR_KERNEL_SM_BLOCK * gridDim.y ) {
+
+      for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) {
+        den_shared[0][threadIdx.x][sm_y] = 0.;
+        den_shared[1][threadIdx.x][sm_y] = 0.;
+        den_shared[2][threadIdx.x][sm_y] = 0.;
+        den_shared[3][threadIdx.x][sm_y] = 0.;
+
+        if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) {
+          const double* db_col   = den_basis_prod_device + (bid_x + sm_y)*npts;
+          const double* bf_col   = basis_eval_device     + (bid_x + sm_y)*npts;
+          const double* bf_x_col = dbasis_x_eval_device  + (bid_x + sm_y)*npts;
+          const double* bf_y_col = dbasis_y_eval_device  + (bid_x + sm_y)*npts;
+          const double* bf_z_col = dbasis_z_eval_device  + (bid_x + sm_y)*npts;
+
+          den_shared[0][threadIdx.x][sm_y] = bf_col [ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ];
+          den_shared[1][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ];
+          den_shared[2][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ];
+          den_shared[3][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ];
+        }
+      }
+      __syncthreads();
+
+
+      for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) {
+        const int tid_y = bid_y + sm_y;
+        double den_reg = den_shared[0][sm_y][threadIdx.x];
+        double dx_reg  = den_shared[1][sm_y][threadIdx.x];
+        double dy_reg  = den_shared[2][sm_y][threadIdx.x];
+        double dz_reg  = den_shared[3][sm_y][threadIdx.x];
+
+        // Warp blocks are stored col major
+        den_reg = hip::warp_reduce_sum( den_reg );
+        dx_reg  = 2. * hip::warp_reduce_sum( dx_reg );
+        dy_reg  = 2. * hip::warp_reduce_sum( dy_reg );
+        dz_reg  = 2. * hip::warp_reduce_sum( dz_reg );
+
+
+        if( threadIdx.x == 0 and tid_y < npts ) {
+          atomicAdd( den_eval_device   + tid_y, den_reg );
+          atomicAdd( den_x_eval_device + tid_y, dx_reg );
+          atomicAdd( den_y_eval_device + tid_y, dy_reg );
+          atomicAdd( den_z_eval_device + tid_y, dz_reg );
+        }
+      }
+      __syncthreads();
+    }
+  }
+
+}
+
+
+
+template <density_id den_select>
+__global__ void eval_vvar_kern( size_t ntasks,
+  XCDeviceTask* tasks_device ) {
+
+  const int batch_idx = blockIdx.z;
+  if( batch_idx >= ntasks ) return;
+
+  auto& task = tasks_device[ batch_idx ];
+
+  const auto npts = task.npts;
+  const auto nbf  = task.bfn_screening.nbe;
+
+  double* den_eval_device = nullptr;
+  // use the "U" variable (+/- for UKS) even though at this point the density (S/Z) is stored
+  if constexpr (den_select == DEN_S) den_eval_device = task.den_s;
+  if constexpr (den_select == DEN_Z) den_eval_device = task.den_z;
+  if constexpr (den_select == DEN_Y) den_eval_device = task.den_y;
+  if constexpr (den_select == DEN_X) den_eval_device = task.den_x;
+
+  const auto* basis_eval_device = task.bf;
+
+  const auto* den_basis_prod_device = task.zmat;
+
+  const int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  const int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+
+  double den_reg = 0.;
+
+  if( tid_x < nbf and tid_y < npts ) {
+
+    const double* bf_col = basis_eval_device     + tid_x*npts;
+    const double* db_col = den_basis_prod_device + tid_x*npts;
+
+    den_reg = bf_col[ tid_y ] * db_col[ tid_y ];
+
+  }
+
+  // Warp blocks are stored col major
+  constexpr auto warp_size = hip::warp_size;
+  //constexpr auto max_warps_per_thread_block = hip::max_warps_per_thread_block;
+  den_reg = hip::warp_reduce_sum( den_reg );
+
+
+  if( threadIdx.x == 0 and tid_y < npts ) {
+    atomicAdd( den_eval_device + tid_y, den_reg );
+  }
+
+
+}
+
+
+
+
+void eval_vvar( size_t ntasks, int32_t nbf_max, int32_t npts_max, bool do_grad, density_id den_select,
+  XCDeviceTask* device_tasks, device_queue queue ) {
+
+  hipStream_t stream = queue.queue_as<util::hip_stream>();
+  dim3 threads;
+  dim3 blocks;
+  if( do_grad ) {
+    threads = dim3( hip::warp_size, hip::max_warps_per_thread_block / 2, 1 );
+    blocks  = dim3( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
+                    std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+                    ntasks );
+  } else {
+    threads = dim3( hip::warp_size, hip::max_warps_per_thread_block, 1 );
+    blocks  = dim3( util::div_ceil( nbf_max, threads.x ),
+                    util::div_ceil( npts_max, threads.y ),
+                    ntasks );
+  }
+  switch( den_select ) {
+    case DEN_S:
+      if (do_grad) eval_vvar_grad_kern<DEN_S><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      else         eval_vvar_kern<DEN_S><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      break;
+    case DEN_Z:
+      if (do_grad) eval_vvar_grad_kern<DEN_Z><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      else         eval_vvar_kern<DEN_Z><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      break;
+    case DEN_Y:
+      if (do_grad) eval_vvar_grad_kern<DEN_Y><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      else         eval_vvar_kern<DEN_Y><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      break;
+    case DEN_X:
+      if (do_grad) eval_vvar_grad_kern<DEN_X><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      else         eval_vvar_kern<DEN_X><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
+      break;
+    default:
+      GAUXC_GENERIC_EXCEPTION( "eval_vvar called with improper density selected" );
+  }
+
+}
+
+
+
+
+
 }
diff --git a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx
index d33d2b1f..d714927c 100644
--- a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx
+++ b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx
@@ -530,12 +530,8 @@ void AoSScheme1Base::eval_uvars_gga( XCDeviceData* _data, integrator_ks_scheme k
   // Evaluate U variable
   auto aos_stack = data->aos_stack;
-#ifdef GAUXC_HAS_HIP
-  throw std::runtime_error("uvars_gga not supported with HIP");
-#else
   GauXC::eval_uvars_gga( ntasks, npts_max, ks_scheme, aos_stack.device_tasks,
     data->device_backend_->queue() );
-#endif
 
   data->device_backend_->check_error("uvvar gga" __FILE__ ": " + std::to_string(__LINE__));
 
@@ -566,12 +562,8 @@ void AoSScheme1Base::eval_uvars_mgga( XCDeviceData* _data, bool do_lapl ){
   // Evaluate U variables
   auto aos_stack = data->aos_stack;
-#ifdef GAUXC_HAS_HIP
-  throw std::runtime_error("uvars_mgga not supported with HIP");
-#else
   GauXC::eval_uvars_mgga( ntasks, data->total_npts_task_batch, nbe_max, npts_max,
    do_lapl, aos_stack.device_tasks, data->device_backend_->queue() );
-#endif
 
   data->device_backend_->check_error("uvvar mgga" __FILE__ ": " + std::to_string(__LINE__));
 
@@ -637,12 +629,8 @@ void AoSScheme1Base::eval_vvar( XCDeviceData* _data, density_id den_select, bool
   // Evaluate V variable
   auto aos_stack = data->aos_stack;
-#ifdef GAUXC_HAS_HIP
-  throw std::runtime_error("vvar not supported with HIP");
-#else
   GauXC::eval_vvar( ntasks, nbe_max, npts_max, do_grad, den_select, aos_stack.device_tasks,
    data->device_backend_->queue() );
-#endif
 
 }

From 7c26939550b47d3126bce17092d6a70b59013a41 Mon Sep 17 00:00:00 2001
From: Ryan Stocks
Date: Thu, 26 Sep 2024 10:31:55 +0800
Subject: [PATCH 16/27] Allow passing additional flags to obara saika host
 compilation

Helps with NVHPC segfault
---
 .../local_work_driver/host/obara_saika/CMakeLists.txt | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt 
b/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt index b103b4d4..7da96629 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt @@ -29,6 +29,16 @@ set( GAUXC_OBARA_SAIKA_HOST_SRC src/obara_saika_integrals.cxx src/chebyshev_boys_computation.cxx ) + +# This is required for compilation with NVHPC as it crashes with the O3 optimization flag +if (DEFINED GAUXC_OBARA_SAIKA_COMPILE_OPTIMIZATION_OPTIONS) + foreach (flag "[\\/\\-]O3" "[\\/\\-]Ofast" "[\\/\\-]fast") + string(REGEX REPLACE ${flag} ${GAUXC_OBARA_SAIKA_COMPILE_OPTIMIZATION_OPTIONS} CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") + endforeach() + message("Setting Obara-Saika CMAKE_CXX_FLAGS_RELEASE to: ${CMAKE_CXX_FLAGS_RELEASE}") +endif() + + target_sources( gauxc PRIVATE ${GAUXC_OBARA_SAIKA_HOST_SRC} ) target_include_directories( gauxc PUBLIC $ From 2bb4783a08b5b39fa7429cad68fe5ed9ef52d37a Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Thu, 26 Sep 2024 22:20:12 +0800 Subject: [PATCH 17/27] Moved obara saika compile flags override --- src/CMakeLists.txt | 16 ++++++++++++++++ .../host/obara_saika/CMakeLists.txt | 9 --------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7e51e9fa..6dc291fc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -224,3 +224,19 @@ install( FILES # Install Custom Find Modules include( ${linalg-cmake-modules_SOURCE_DIR}/LinAlgModulesMacros.cmake ) install_linalg_modules( INSTALL_CONFIGDIR ) + +# This allows specifying a lower compiler optimization level for NVHPC which fails to compile with the -O3 flag whilst leaving the remaining flags unchanged +if (DEFINED GAUXC_OBARA_SAIKA_COMPILE_OPTIMIZATION_OPTIONS) + get_target_property(default_compile_options gauxc COMPILE_OPTIONS) + get_target_property(gauxc_sources gauxc SOURCES) + set_target_properties(gauxc PROPERTIES COMPILE_OPTIONS "") + set_source_files_properties(${gauxc_sources} PROPERTIES COMPILE_OPTIONS "${default_compile_options}") + + file(GLOB OB_HOST_SRC_FILES ${CMAKE_CURRENT_LIST_DIR}/xc_integrator/local_work_driver/host/obara_saika/src/*.cxx) + set(adjusted_compile_options ${default_compile_options}) + foreach (flag "[\\/\\-]O3" "[\\/\\-]Ofast" "[\\/\\-]fast") + string(REGEX REPLACE ${flag} ${GAUXC_OBARA_SAIKA_COMPILE_OPTIMIZATION_OPTIONS} adjusted_compile_options "${adjusted_compile_options}") + endforeach() + message("-- Setting Obara-Saika COMPILE_OPTIONS to: ${adjusted_compile_options}") + set_source_files_properties(${OB_HOST_SRC_FILES} PROPERTIES COMPILE_OPTIONS "${adjusted_compile_options}") +endif() diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt b/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt index 7da96629..c37f0ebe 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt @@ -30,15 +30,6 @@ set( GAUXC_OBARA_SAIKA_HOST_SRC src/chebyshev_boys_computation.cxx ) -# This is required for compilation with NVHPC as it crashes with the O3 optimization flag -if (DEFINED GAUXC_OBARA_SAIKA_COMPILE_OPTIMIZATION_OPTIONS) - foreach (flag "[\\/\\-]O3" "[\\/\\-]Ofast" "[\\/\\-]fast") - string(REGEX REPLACE ${flag} ${GAUXC_OBARA_SAIKA_COMPILE_OPTIMIZATION_OPTIONS} CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") - endforeach() - message("Setting Obara-Saika CMAKE_CXX_FLAGS_RELEASE to: 
${CMAKE_CXX_FLAGS_RELEASE}") -endif() - - target_sources( gauxc PRIVATE ${GAUXC_OBARA_SAIKA_HOST_SRC} ) target_include_directories( gauxc PUBLIC $ From ecf6eacc411a4cfe09e6f8e5b3b2855264e0ffb7 Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Mon, 30 Sep 2024 20:17:50 +1000 Subject: [PATCH 18/27] Compiling HIP on NVIDIA --- cmake/gauxc-dep-versions.cmake | 4 ++-- .../local_work_driver/device/hip/kernels/hip_ssh_2d.hip | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/cmake/gauxc-dep-versions.cmake b/cmake/gauxc-dep-versions.cmake index 4e1dd9be..9a2c0757 100644 --- a/cmake/gauxc-dep-versions.cmake +++ b/cmake/gauxc-dep-versions.cmake @@ -10,8 +10,8 @@ set( GAUXC_CUB_REVISION 1.10.0 ) set( GAUXC_CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git ) set( GAUXC_CUTLASS_REVISION v2.10.0 ) -set( GAUXC_EXCHCXX_REPOSITORY https://github.com/wavefunction91/ExchCXX.git ) -set( GAUXC_EXCHCXX_REVISION 7b017151061f155cac341f2d155aaa47e43a8190) +set( GAUXC_EXCHCXX_REPOSITORY https://github.com/ryanstocks00/ExchCXX.git ) +set( GAUXC_EXCHCXX_REVISION 8a0004609afc710bdad4367867026a9daa0a758b) set( GAUXC_GAU2GRID_REPOSITORY https://github.com/dgasmith/gau2grid.git ) set( GAUXC_GAU2GRID_REVISION v2.0.6 ) diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip index d4f6eda9..727f6061 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip @@ -122,7 +122,12 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, int cont = (iCenter < natoms); // We will continue iterating until all of the threads have cont set to 0 + +#ifdef __HIP_PLATFORM_NVIDIA__ + while (__any_sync(__activemask(), cont)) { +#else while (__any(cont)) { +#endif if (cont) { double2 rj[weight_unroll/2]; double2 rab_val[weight_unroll/2]; From 23c78e98a829830a0cf55bd90169272e885a5140 Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Tue, 8 Oct 2024 15:53:13 +1100 Subject: [PATCH 19/27] Pseudofunctional HIP on NVIDIA --- .../device_specific/hip_device_constants.hpp | 6 +- .../collocation_angular_spherical_unnorm.hpp | 81 +++++++++++++++++++ .../collocation_device_constants.hpp | 7 +- .../device/hip/kernels/hip_extensions.hpp | 22 ++++- .../device/hip/kernels/hip_ssh_2d.hip | 7 +- .../device/hip/kernels/uvvars.hip | 2 +- 6 files changed, 115 insertions(+), 10 deletions(-) diff --git a/src/runtime_environment/device_specific/hip_device_constants.hpp b/src/runtime_environment/device_specific/hip_device_constants.hpp index 3a79fdf3..07adbd38 100644 --- a/src/runtime_environment/device_specific/hip_device_constants.hpp +++ b/src/runtime_environment/device_specific/hip_device_constants.hpp @@ -11,8 +11,12 @@ namespace GauXC { namespace hip { +#ifdef __HIP_PLATFORM_NVIDIA__ +static constexpr uint32_t warp_size = 32; +#else static constexpr uint32_t warp_size = 64; -static constexpr uint32_t max_threads_per_thread_block = 1024; +#endif +static constexpr uint32_t max_threads_per_thread_block = 512; static constexpr uint32_t max_warps_per_thread_block = max_threads_per_thread_block / warp_size; diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp index 5a5e78a5..9c2b4859 100644 --- 
a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp @@ -211,6 +211,76 @@ GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_3_deriv1( } +template +GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_4( + int32_t npts, + const T bf, + const T x, + const T y, + const T z, + T* __restrict__ eval +) { + + eval[npts * 0] = sqrt_35*bf*x*y*(x*x - y*y)/2; + eval[npts * 1] = sqrt_70*bf*y*z*(3*x*x - y*y)/4; + eval[npts * 2] = sqrt_5*bf*x*y*(-x*x - y*y + 6*z*z)/2; + eval[npts * 3] = sqrt_10*bf*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + eval[npts * 4] = bf*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + eval[npts * 5] = sqrt_10*bf*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; + eval[npts * 6] = sqrt_5*bf*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; + eval[npts * 7] = sqrt_70*bf*x*z*(x*x - 3*y*y)/4; + eval[npts * 8] = sqrt_35*bf*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + +} + +template +GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_4_deriv1( + const int32_t npts, + const T bf, + const T bf_x, + const T bf_y, + const T bf_z, + const T x, + const T y, + const T z, + T* __restrict__ eval_x, + T* __restrict__ eval_y, + T* __restrict__ eval_z +) { + + eval_x[npts * 0] = sqrt_35*y*(bf*(3*x*x - y*y) + bf_x*x*(x*x - y*y))/2; + eval_x[npts * 1] = sqrt_70*y*z*(6*bf*x + bf_x*(3*x*x - y*y))/4; + eval_x[npts * 2] = sqrt_5*y*(-bf*(3*x*x + y*y - 6*z*z) - bf_x*x*(x*x + y*y - 6*z*z))/2; + eval_x[npts * 3] = sqrt_10*y*z*(-6*bf*x - bf_x*(3*x*x + 3*y*y - 4*z*z))/4; + eval_x[npts * 4] = 3*bf*x*(x*x + y*y - 4*z*z)/2 + bf_x*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + eval_x[npts * 5] = sqrt_10*z*(-bf*(9*x*x + 3*y*y - 4*z*z) - bf_x*x*(3*x*x + 3*y*y - 4*z*z))/4; + eval_x[npts * 6] = sqrt_5*(-bf*x*(x*x - 3*z*z) - bf_x*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z)/4); + eval_x[npts * 7] = sqrt_70*z*(3*bf*(x*x - y*y) + bf_x*x*(x*x - 3*y*y))/4; + eval_x[npts * 8] = sqrt_35*(4*bf*x*(x*x - 3*y*y) + bf_x*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + eval_y[npts * 0] = sqrt_35*x*(-bf*(-x*x + 3*y*y) + bf_y*y*(x*x - y*y))/2; + eval_y[npts * 1] = sqrt_70*z*(-3*bf*(-x*x + y*y) + bf_y*y*(3*x*x - y*y))/4; + eval_y[npts * 2] = sqrt_5*x*(-bf*(x*x + 3*y*y - 6*z*z) - bf_y*y*(x*x + y*y - 6*z*z))/2; + eval_y[npts * 3] = sqrt_10*z*(-bf*(3*x*x + 9*y*y - 4*z*z) - bf_y*y*(3*x*x + 3*y*y - 4*z*z))/4; + eval_y[npts * 4] = 3*bf*y*(x*x + y*y - 4*z*z)/2 + bf_y*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + eval_y[npts * 5] = sqrt_10*x*z*(-6*bf*y - bf_y*(3*x*x + 3*y*y - 4*z*z))/4; + eval_y[npts * 6] = sqrt_5*(bf*y*(y*y - 3*z*z) - bf_y*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z)/4); + eval_y[npts * 7] = sqrt_70*x*z*(-6*bf*y + bf_y*(x*x - 3*y*y))/4; + eval_y[npts * 8] = sqrt_35*(-4*bf*y*(3*x*x - y*y) + bf_y*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + + eval_z[npts * 0] = sqrt_35*bf_z*x*y*(x*x - y*y)/2; + eval_z[npts * 1] = sqrt_70*y*(bf + bf_z*z)*(3*x*x - y*y)/4; + eval_z[npts * 2] = sqrt_5*x*y*(12*bf*z - bf_z*(x*x + y*y - 6*z*z))/2; + eval_z[npts * 3] = sqrt_10*y*(3*bf*(-x*x - y*y + 4*z*z) - bf_z*z*(3*x*x + 3*y*y - 4*z*z))/4; + eval_z[npts * 4] = -2*bf*z*(3*x*x + 3*y*y - 2*z*z) + bf_z*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; + eval_z[npts * 5] = sqrt_10*x*(3*bf*(-x*x - y*y + 4*z*z) - bf_z*z*(3*x*x + 3*y*y - 4*z*z))/4; + eval_z[npts * 6] 
= sqrt_5*(12*bf*z*(x*x - y*y) - bf_z*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; + eval_z[npts * 7] = sqrt_70*x*(bf + bf_z*z)*(x*x - 3*y*y)/4; + eval_z[npts * 8] = sqrt_35*bf_z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + +} + + template GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular( @@ -239,8 +309,14 @@ GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular( collocation_spherical_unnorm_angular_3( npts, bf, x, y, z, eval ); + } else if( l == 4 ) { + + collocation_spherical_unnorm_angular_4( npts, bf, x, y, z, eval ); + } else { + assert( false && "L < L_MAX" ); + } } // collocation_spherical_unnorm_angular @@ -284,6 +360,11 @@ GPGAUEVAL_INLINE __device__ void collocation_spherical_unnorm_angular_deriv1( collocation_spherical_unnorm_angular_3( npts, bf, x, y, z, eval ); collocation_spherical_unnorm_angular_3_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); + } else if( l == 4 ) { + + collocation_spherical_unnorm_angular_4( npts, bf, x, y, z, eval ); + collocation_spherical_unnorm_angular_4_deriv1( npts, bf, bf_x, bf_y, bf_z, x, y, z, eval_x, eval_y, eval_z ); + } else { assert( false && "L < L_MAX" ); } diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp index c7405df7..30d702a5 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp @@ -9,9 +9,12 @@ namespace GauXC { - constexpr double sqrt_15 = 3.872983346207417; constexpr double sqrt_3 = 1.7320508075688772; - constexpr double sqrt_6 = 2.449489742783178; + constexpr double sqrt_5 = 2.23606797749979; + constexpr double sqrt_15 = 3.872983346207417; constexpr double sqrt_10 = 3.1622776601683795; + constexpr double sqrt_6 = 2.449489742783178; + constexpr double sqrt_35 = 5.916079783099616; + constexpr double sqrt_70 = 8.366600265340756; } // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp index 7ba02d5a..e23086cd 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp @@ -16,31 +16,45 @@ namespace hip { template __device__ T warp_reduce_sum( T val ) { +#ifdef __HIP_PLATFORM_NVIDIA__ + for(int i=(warp_sz/2); i>=1; i/=2) + val += __shfl_xor_sync(0xffffffff, val, i, warp_sz); + + return val; +#else using warp_reducer = hipcub::WarpReduce; - static __shared__ typename warp_reducer::TempStorage + static __shared__ typename warp_reducer::TempStorage temp_storage[hip::max_warps_per_thread_block]; - int tid = + int tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; int warp_lane = tid / warp_size; return warp_reducer( temp_storage[warp_lane] ).Sum( val ); +#endif } template __device__ T warp_reduce_prod( T val ) { +#ifdef __HIP_PLATFORM_NVIDIA__ + for(int i=(warp_sz/2); i>=1; i/=2) + val *= __shfl_xor_sync(0xffffffff, val, i, warp_sz); + + return val; +#else using warp_reducer = hipcub::WarpReduce; - static __shared__ typename warp_reducer::TempStorage + static __shared__ typename warp_reducer::TempStorage temp_storage[hip::max_warps_per_thread_block]; - int tid = + int tid = threadIdx.x + threadIdx.y 
* blockDim.x + threadIdx.z * blockDim.x * blockDim.y; int warp_lane = tid / warp_size; return warp_reducer( temp_storage[warp_lane] ).Reduce( val, [](const T& a, const T& b){ return a * b; } ); +#endif } diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip index 727f6061..555c71df 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip @@ -99,6 +99,7 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, if( parent_weight < eps_d ) { if (threadIdx.x == 0) weights[ipt] = 0.; + __syncwarp(); continue; } @@ -106,6 +107,7 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, if (threadIdx.x == 0) { jCounter[0] = 0; } + __syncwarp(); // Each thread will process an iCenter. Atomic operations are used to assign // an iCenter value to each thread. @@ -137,7 +139,7 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, #pragma unroll for (int k = 0; k < weight_unroll/2; k++) { rj[k] = *((double2*)(local_dist_scratch + jCenter) + k); - rab_val[k] = *((double2*)(local_rab + jCenter) + k); + rab_val[k] = *((double2*)(local_rab + jCenter) + k); } #pragma unroll @@ -175,6 +177,7 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, jCenter = (jCenter < ldRAB) ? jCenter : 0; } + __syncwarp(); // All of the threads then sum their contributions. Only thread 0 needs to add the parent // contribution. sum = hip::warp_reduce_sum(sum); @@ -182,7 +185,7 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, sum += parent_weight; weights[ipt] *= parent_weight / sum; } - + __syncwarp(); #endif } diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip index 617835b2..2087dd87 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip @@ -14,7 +14,7 @@ namespace GauXC { #define VVAR_KERNEL_SM_BLOCK 16 -#define GGA_KERNEL_SM_WARPS 16 +#define GGA_KERNEL_SM_WARPS 8 #define MGGA_KERNEL_SM_BLOCK 16 __global__ void eval_uvars_lda_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { From 031fb0aab32c24eb98df1a0869a6f1cc56f1b683 Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Tue, 8 Oct 2024 17:04:15 +1100 Subject: [PATCH 20/27] Fixed mem access violation --- .../device/hip/kernels/hip_ssh_2d.hip | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip index 555c71df..31ec8ca5 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip @@ -138,8 +138,16 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, #pragma unroll for (int k = 0; k < weight_unroll/2; k++) { - rj[k] = *((double2*)(local_dist_scratch + jCenter) + k); - rab_val[k] = *((double2*)(local_rab + jCenter) + k); + double2* addr = (double2*)(local_dist_scratch + jCenter) + k; + rj[k].x = addr->x; + rj[k].y = addr->y; + double2* addr2 = (double2*)(local_rab + jCenter) + k; + rab_val[k].x = addr2->x; + rab_val[k].y = addr2->y; + // These caused a memory access violation when lddist is not a + // multiple of 2 as then there can be an 
unaligned access + // rj[k] = *((double2*)(local_dist_scratch + jCenter) + k); + // rab_val[k] = *((double2*)(local_rab + jCenter) + k); } #pragma unroll From eeff1054e85e5d904b715fd620f54dc8fd828a82 Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Tue, 8 Oct 2024 23:24:30 +1100 Subject: [PATCH 21/27] Copy zmat from cuda --- .../device/hip/kernels/zmat_vxc.hip | 436 ++++++++++++++++-- 1 file changed, 400 insertions(+), 36 deletions(-) diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip index c8b09f7e..ac1fcd57 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip @@ -14,7 +14,7 @@ namespace GauXC { -__global__ void zmat_lda_vxc_kernel( size_t ntasks, +__global__ void zmat_lda_vxc_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { const int batch_idx = blockIdx.z; @@ -46,36 +46,93 @@ __global__ void zmat_lda_vxc_kernel( size_t ntasks, -void zmat_lda_vxc( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCDeviceTask* tasks_device, - integrator_ks_scheme scheme, - density_id sel, - device_queue queue ) { - hipStream_t stream = queue.queue_as() ; - dim3 threads(hip::warp_size,hip::max_warps_per_thread_block,1); - dim3 blocks( util::div_ceil( max_npts, threads.x ), - util::div_ceil( max_nbf, threads.y ), - ntasks ); - hipLaunchKernelGGL(zmat_lda_vxc_kernel, dim3(blocks), dim3(threads), 0, stream , ntasks, tasks_device ); + + + + + +template +__global__ void zmat_lda_vxc_uks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + const double* vrho_pos_device = task.vrho_pos; + const double* vrho_neg_device = task.vrho_neg; + + + const auto* basis_eval_device = task.bf; + + + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + const double factp = 0.5 * vrho_pos_device[tid_x]; + const double factm = 0.5 * vrho_neg_device[tid_x]; + double sign = 1.0; + if constexpr ( den_selector == DEN_Z ) sign = -1.0; + + z_matrix_device[ ibfoff ] = 0.5*(factp * basis_eval_device[ ibfoff ] + sign * factm * basis_eval_device[ ibfoff ]); + } } +template +__global__ void zmat_lda_vxc_gks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + const double* vrho_pos_device = task.vrho_pos; + const double* vrho_neg_device = task.vrho_neg; + double* K_device; + if constexpr ( den_selector == DEN_Z ) K_device = task.K_z; + if constexpr ( den_selector == DEN_Y ) K_device = task.K_y; + if constexpr ( den_selector == DEN_X ) K_device = task.K_x; + + const auto* basis_eval_device = task.bf; + auto* z_matrix_device = task.zmat; + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + if( tid_x < npts and tid_y < nbf ) { + const size_t ibfoff = tid_y * npts + tid_x; + const double factp = 0.5 * vrho_pos_device[tid_x]; + const double factm = 0.5 * vrho_neg_device[tid_x]; + if 
constexpr ( den_selector == DEN_S ) { + z_matrix_device[ ibfoff ] = 0.5*(factp * basis_eval_device[ ibfoff ] + factm * basis_eval_device[ ibfoff ]); + } + else { + const double factk = 0.5 * (factp - factm); + z_matrix_device[ ibfoff ] = K_device[ ibfoff ] * factk * basis_eval_device[ ibfoff ]; + } + } +} @@ -85,7 +142,7 @@ void zmat_lda_vxc( size_t ntasks, -__global__ void zmat_gga_vxc_kernel( size_t ntasks, +__global__ void zmat_gga_vxc_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { const int batch_idx = blockIdx.z; @@ -126,29 +183,219 @@ __global__ void zmat_gga_vxc_kernel( size_t ntasks, } } -void zmat_gga_vxc( size_t ntasks, - int32_t max_nbf, - int32_t max_npts, - XCDeviceTask* tasks_device, - integrator_ks_scheme scheme, - density_id sel, - device_queue queue ) { - hipStream_t stream = queue.queue_as() ; - dim3 threads(hip::warp_size,hip::max_warps_per_thread_block,1); - dim3 blocks( util::div_ceil( max_npts, threads.x ), - util::div_ceil( max_nbf, threads.y ), - ntasks ); - hipLaunchKernelGGL(zmat_gga_vxc_kernel, dim3(blocks), dim3(threads), 0, stream , ntasks, tasks_device ); + + +template +__global__ void zmat_gga_vxc_uks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + const double* vrho_pos_device = task.vrho_pos; + const double* vrho_neg_device = task.vrho_neg; + const double* vgamma_pp_device = task.vgamma_pp; + const double* vgamma_pm_device = task.vgamma_pm; + const double* vgamma_mm_device = task.vgamma_mm; + + const auto* den_pos_x_eval_device = task.dden_sx; + const auto* den_pos_y_eval_device = task.dden_sy; + const auto* den_pos_z_eval_device = task.dden_sz; + const auto* den_neg_x_eval_device = task.dden_zx; + const auto* den_neg_y_eval_device = task.dden_zy; + const auto* den_neg_z_eval_device = task.dden_zz; + + + const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + + const double factp = 0.25 * vrho_pos_device[tid_x]; + const double factm = 0.25 * vrho_neg_device[tid_x]; + + const auto gga_fact_pp = vgamma_pp_device[tid_x]; + const auto gga_fact_pm = vgamma_pm_device[tid_x]; + const auto gga_fact_mm = vgamma_mm_device[tid_x]; + + const auto gga_fact_1 = 0.5*(gga_fact_pp + gga_fact_pm + gga_fact_mm); + const auto gga_fact_2 = 0.5*(gga_fact_pp - gga_fact_mm); + const auto gga_fact_3 = 0.5*(gga_fact_pp - gga_fact_pm + gga_fact_mm); + + double sign = 1.0; + + double x_fact, y_fact, z_fact; + + if constexpr ( den_selector == DEN_S ) { + x_fact = gga_fact_1 * den_pos_x_eval_device[ tid_x ] + gga_fact_2 * den_neg_x_eval_device[ tid_x ]; + y_fact = gga_fact_1 * den_pos_y_eval_device[ tid_x ] + gga_fact_2 * den_neg_y_eval_device[ tid_x ]; + z_fact = gga_fact_1 * den_pos_z_eval_device[ tid_x ] + gga_fact_2 * den_neg_z_eval_device[ tid_x ]; + + + } + if constexpr ( den_selector == DEN_Z ) { + sign = -1.0; + x_fact = gga_fact_3 * den_neg_x_eval_device[ tid_x ] + gga_fact_2 * den_pos_x_eval_device[ tid_x ]; + y_fact = gga_fact_3 * den_neg_y_eval_device[ tid_x ] + gga_fact_2 * 
den_pos_y_eval_device[ tid_x ]; + z_fact = gga_fact_3 * den_neg_z_eval_device[ tid_x ] + gga_fact_2 * den_pos_z_eval_device[ tid_x ]; + + } + + z_matrix_device[ ibfoff ] = x_fact * dbasis_x_eval_device[ ibfoff ] + + y_fact * dbasis_y_eval_device[ ibfoff ] + + z_fact * dbasis_z_eval_device[ ibfoff ] + + (factp + sign * factm) * basis_eval_device[ ibfoff ]; + } +} + + + + +template +__global__ void zmat_gga_vxc_gks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + const double* vrho_pos_device = task.vrho_pos; + const double* vrho_neg_device = task.vrho_neg; + const double* vgamma_pp_device = task.vgamma_pp; + const double* vgamma_pm_device = task.vgamma_pm; + const double* vgamma_mm_device = task.vgamma_mm; + + + // for non-DEN_S + double* K_device; + double* H_device; + if constexpr ( den_selector == DEN_Z ) { K_device = task.K_z; H_device = task.H_z; } + if constexpr ( den_selector == DEN_Y ) { K_device = task.K_y; H_device = task.H_y; } + if constexpr ( den_selector == DEN_X ) { K_device = task.K_x; H_device = task.H_x; } + + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + const auto* dden_zx_eval_device = task.dden_zx; + const auto* dden_zy_eval_device = task.dden_zy; + const auto* dden_zz_eval_device = task.dden_zz; + const auto* dden_yx_eval_device = task.dden_yx; + const auto* dden_yy_eval_device = task.dden_yy; + const auto* dden_yz_eval_device = task.dden_yz; + const auto* dden_xx_eval_device = task.dden_xx; + const auto* dden_xy_eval_device = task.dden_xy; + const auto* dden_xz_eval_device = task.dden_xz; + + + const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + + const double fact_p = 0.5*vrho_pos_device[tid_x]; + const double fact_m = 0.5*vrho_neg_device[tid_x]; + + const auto gga_fact_pp = vgamma_pp_device[tid_x]; + const auto gga_fact_pm = vgamma_pm_device[tid_x]; + const auto gga_fact_mm = vgamma_mm_device[tid_x]; + + const auto gga_fact_1 = 0.5*(gga_fact_pp + gga_fact_pm + gga_fact_mm); + const auto gga_fact_2 = 0.5*(gga_fact_pp - gga_fact_mm); + const auto gga_fact_3 = 0.5*(gga_fact_pp - gga_fact_pm + gga_fact_mm); + + double s_fact, x_fact, y_fact, z_fact; + + if constexpr ( den_selector == DEN_S ) { + const double* Hz_device = task.H_z; + const double* Hy_device = task.H_y; + const double* Hx_device = task.H_x; + + s_fact = 0.5 * (fact_p + fact_m); + + x_fact = gga_fact_1 * dden_sx_eval_device[ tid_x ] + + gga_fact_2 * (Hz_device[ tid_x ] * dden_zx_eval_device[ tid_x ] + + Hy_device[ tid_x ] * dden_yx_eval_device[ tid_x ] + + Hx_device[ tid_x ] * dden_xx_eval_device[ tid_x ] ); + y_fact = gga_fact_1 * dden_sy_eval_device[ tid_x ] + + gga_fact_2 * (Hz_device[ tid_x ] * dden_zy_eval_device[ tid_x ] + + Hy_device[ tid_x ] * dden_yy_eval_device[ tid_x ] + + Hx_device[ tid_x ] * dden_xy_eval_device[ tid_x ] ); + z_fact = gga_fact_1 * dden_sz_eval_device[ tid_x ] + + gga_fact_2 * (Hz_device[ tid_x ] * 
dden_zz_eval_device[ tid_x ] + + Hy_device[ tid_x ] * dden_yz_eval_device[ tid_x ] + + Hx_device[ tid_x ] * dden_xz_eval_device[ tid_x ] ); + } + + if constexpr ( den_selector == DEN_Z ) { + s_fact = K_device[ tid_x ] * 0.5 * (fact_p - fact_m); + x_fact = gga_fact_3 * dden_zx_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sx_eval_device[ tid_x ]; + y_fact = gga_fact_3 * dden_zy_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sy_eval_device[ tid_x ]; + z_fact = gga_fact_3 * dden_zz_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sz_eval_device[ tid_x ]; + } + + if constexpr ( den_selector == DEN_Y ) { + s_fact = K_device[ tid_x ] * 0.5 * (fact_p - fact_m); + x_fact = gga_fact_3 * dden_yx_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sx_eval_device[ tid_x ]; + y_fact = gga_fact_3 * dden_yy_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sy_eval_device[ tid_x ]; + z_fact = gga_fact_3 * dden_yz_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sz_eval_device[ tid_x ]; + } + + if constexpr ( den_selector == DEN_X ) { + s_fact = K_device[ tid_x ] * 0.5 * (fact_p - fact_m); + x_fact = gga_fact_3 * dden_xx_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sx_eval_device[ tid_x ]; + y_fact = gga_fact_3 * dden_xy_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sy_eval_device[ tid_x ]; + z_fact = gga_fact_3 * dden_xz_eval_device[ tid_x ] + + gga_fact_2 * H_device[ tid_x ] * dden_sz_eval_device[ tid_x ]; + } + + z_matrix_device[ ibfoff ] = x_fact * dbasis_x_eval_device[ ibfoff ] + + y_fact * dbasis_y_eval_device[ ibfoff ] + + z_fact * dbasis_z_eval_device[ ibfoff ] + + s_fact * basis_eval_device[ ibfoff ]; + + } } - template -__global__ void zmat_mgga_vxc_kernel( size_t ntasks, +__global__ void zmat_mgga_vxc_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { const int batch_idx = blockIdx.z; @@ -198,6 +445,60 @@ __global__ void zmat_mgga_vxc_kernel( size_t ntasks, } } + + +#define ZMAT_VXC_KERN(xc_approx) \ + hipStream_t stream = queue.queue_as(); \ + dim3 threads(hip::warp_size,hip::max_warps_per_thread_block,1); \ + dim3 blocks( util::div_ceil( max_npts, threads.x ), \ + util::div_ceil( max_nbf, threads.y ), \ + ntasks ); \ + switch( scheme ) { \ + case RKS: \ + zmat_##xc_approx##_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + break; \ + case UKS: \ + if ( sel == DEN_S ) zmat_##xc_approx##_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else if ( sel == DEN_Z ) zmat_##xc_approx##_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else GAUXC_GENERIC_EXCEPTION( "zmat_##xc_approx##_vxc invalid density" ); \ + break; \ + case GKS: \ + if ( sel == DEN_S ) zmat_##xc_approx##_vxc_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else if ( sel == DEN_Z ) zmat_##xc_approx##_vxc_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else if ( sel == DEN_Y ) zmat_##xc_approx##_vxc_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else if ( sel == DEN_X ) zmat_##xc_approx##_vxc_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else GAUXC_GENERIC_EXCEPTION( "zmat_##xc_approx##_vxc invalid density" ); \ + break; \ + default: \ + GAUXC_GENERIC_EXCEPTION( "zmat_##xc_approx##_vxc invalid KS scheme" ); \ + } + + + +void zmat_lda_vxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + 
integrator_ks_scheme scheme, + density_id sel, + device_queue queue ) { +ZMAT_VXC_KERN(lda) +} + + + +void zmat_gga_vxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + integrator_ks_scheme scheme, + density_id sel, + device_queue queue ) { +ZMAT_VXC_KERN(gga) +} + + + void zmat_mgga_vxc( size_t ntasks, int32_t max_nbf, int32_t max_npts, @@ -214,15 +515,29 @@ void zmat_mgga_vxc( size_t ntasks, ntasks ); if(do_lapl) - zmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + zmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); else - zmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + zmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); } + + + + + + + + + + + + + + template -__global__ void mmat_mgga_vxc_kernel( size_t ntasks, +__global__ void mmat_mgga_vxc_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { const int batch_idx = blockIdx.z; @@ -257,6 +572,55 @@ __global__ void mmat_mgga_vxc_kernel( size_t ntasks, } } +//__global__ void print_zmat_stats( size_t ntasks, +// XCDeviceTask* tasks_device) { +// +// for(size_t iT = 0; iT < ntasks; ++iT) { +// auto& task = tasks_device[iT]; +// const auto npts = task.npts; +// const auto nbf = task.bfn_screening.nbe; +// +// const auto* zmat = task.zmat; +// const auto* bmat = task.bf; +// const auto* blmat = task.d2bflapl; +// +// double znrm = 0.0, bnrm = 0.0, blnrm = 0.0; +// for(auto j = 0; j < npts*nbf; ++j) { +// znrm += zmat[j] * zmat[j]; +// bnrm += bmat[j] * bmat[j]; +// blnrm += blmat[j] * blmat[j]; +// } +// +// const auto* eps = task.eps; +// const auto* vgamma = task.vgamma; +// const auto* vtau = task.vtau; +// const auto* vlapl = task.vlapl; +// const auto* vrho = task.vrho; +// const auto* gamma = task.gamma; +// const auto* tau = task.tau; +// const auto* lapl = task.denlapl; +// const auto* rho = task.den; +// double enrm = 0.0, gnrm = 0.0, tnrm = 0.0, rnrm = 0.0, lnrm = 0.0; +// double vgnrm = 0.0, vtnrm = 0.0, vrnrm = 0.0, vlnrm = 0.0; +// for(auto j = 0; j < npts; ++j) { +// enrm += eps[j] * eps[j]; +// vrnrm += vrho[j] * vrho[j]; +// vgnrm += vgamma[j] * vgamma[j]; +// vtnrm += vtau[j] * vtau[j]; +// vlnrm += vlapl[j] * vlapl[j]; +// +// rnrm += rho[j] * rho[j]; +// gnrm += gamma[j] * gamma[j]; +// tnrm += tau[j] * tau[j]; +// lnrm += lapl[j] * lapl[j]; +// } +// +// printf("ITASK = %lu B = %.6e BL = %.6e R = %.6e G = %.6e T = %.6e L = %.6e E = %.6e VR = %.6e VG = %6e VT = %.6e VL = %.6e Z = %.6e \n", +// iT, bnrm, blnrm, rnrm, gnrm, tnrm, lnrm, enrm, vrnrm, vgnrm, vtnrm, vlnrm, znrm); +// } +// +//} + void mmat_mgga_vxc( size_t ntasks, int32_t max_nbf, int32_t max_npts, @@ -273,12 +637,12 @@ void mmat_mgga_vxc( size_t ntasks, ntasks ); if(do_lapl) - mmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + mmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); else - mmat_mgga_vxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + mmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + //print_zmat_stats<<<1,1,0,stream>>>(ntasks,tasks_device); } - } From 2089af617a9da5a81d147b3fd4a96c103be88e64 Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Wed, 9 Oct 2024 14:39:47 +1100 Subject: [PATCH 22/27] Small refactor of cuda vvar kernel to support any grid/block dims --- .../device/cuda/kernels/uvvars.cu | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 
deletions(-) diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu index f392757c..71e8c387 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu @@ -614,17 +614,23 @@ __global__ void eval_vvar_kern( size_t ntasks, const auto* den_basis_prod_device = task.zmat; - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - register double den_reg = 0.; - if( tid_x < nbf and tid_y < npts ) { + int start_y = blockIdx.y * blockDim.y + threadIdx.y; + + for (int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + tid_x < nbf; + tid_x += blockDim.x * gridDim.x ) { + + for (int tid_y = start_y; + tid_y < npts; + tid_y += blockDim.y * gridDim.y ) { - const double* bf_col = basis_eval_device + tid_x*npts; - const double* db_col = den_basis_prod_device + tid_x*npts; + const double* bf_col = basis_eval_device + tid_x*npts; + const double* db_col = den_basis_prod_device + tid_x*npts; - den_reg = bf_col[ tid_y ] * db_col[ tid_y ]; + den_reg += bf_col[ tid_y ] * db_col[ tid_y ]; + } } @@ -634,8 +640,8 @@ __global__ void eval_vvar_kern( size_t ntasks, den_reg = cuda::warp_reduce_sum( den_reg ); - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); + if( threadIdx.x == 0 and start_y < npts ) { + atomicAdd( den_eval_device + start_y, den_reg ); } From d4675df8893ff42c19e74f77baa1c9d160a4b600 Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Wed, 9 Oct 2024 14:44:59 +1100 Subject: [PATCH 23/27] Revert SM block size changes --- .../local_work_driver/device/hip/kernels/uvvars.hip | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip index 2087dd87..9bd07ac2 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip @@ -13,9 +13,9 @@ namespace GauXC { -#define VVAR_KERNEL_SM_BLOCK 16 -#define GGA_KERNEL_SM_WARPS 8 -#define MGGA_KERNEL_SM_BLOCK 16 +#define VVAR_KERNEL_SM_BLOCK 32 +#define GGA_KERNEL_SM_WARPS 16 +#define MGGA_KERNEL_SM_BLOCK 32 __global__ void eval_uvars_lda_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { // eval_vvars populated uvar storage already in the case of LDA+RKS From bfd880330424668aea3689ca10584cb7ec6c469b Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Wed, 9 Oct 2024 16:27:32 +1100 Subject: [PATCH 24/27] More forceful double instead of double2 --- .../device/hip/kernels/hip_ssh_2d.hip | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip index 31ec8ca5..cefb738d 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip @@ -138,12 +138,12 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, #pragma unroll for (int k = 0; k < weight_unroll/2; k++) { - double2* addr = (double2*)(local_dist_scratch + jCenter) + k; - rj[k].x = addr->x; - rj[k].y = addr->y; - double2* addr2 = (double2*)(local_rab + jCenter) + k; - rab_val[k].x = addr2->x; - rab_val[k].y = addr2->y; + double* addr = 
(double*)((double2*)(local_dist_scratch + jCenter) + k); + rj[k].x = addr[0]; + rj[k].y = addr[1]; + double* addr2 = (double*)((double2*)(local_rab + jCenter) + k); + rab_val[k].x = addr2[0]; + rab_val[k].y = addr2[1]; // These caused a memory access violation when lddist is not a // multiple of 2 as then there can be an unaligned access // rj[k] = *((double2*)(local_dist_scratch + jCenter) + k); From f0b1a5196713295da485b00d0b63f5082664e341 Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Mon, 14 Oct 2024 11:09:40 +1100 Subject: [PATCH 25/27] AMD compilation --- .../local_work_driver/device/hip/kernels/hip_ssh_2d.hip | 4 ---- .../local_work_driver/device/hip/kernels/uvvars.hip | 6 ++++++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip index cefb738d..ce2c06e4 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip @@ -99,7 +99,6 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, if( parent_weight < eps_d ) { if (threadIdx.x == 0) weights[ipt] = 0.; - __syncwarp(); continue; } @@ -107,7 +106,6 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, if (threadIdx.x == 0) { jCounter[0] = 0; } - __syncwarp(); // Each thread will process an iCenter. Atomic operations are used to assign // an iCenter value to each thread. @@ -185,7 +183,6 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, jCenter = (jCenter < ldRAB) ? jCenter : 0; } - __syncwarp(); // All of the threads then sum their contributions. Only thread 0 needs to add the parent // contribution. sum = hip::warp_reduce_sum(sum); @@ -193,7 +190,6 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, sum += parent_weight; weights[ipt] *= parent_weight / sum; } - __syncwarp(); #endif } diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip index 9bd07ac2..478f0fe6 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip @@ -13,9 +13,15 @@ namespace GauXC { +#ifdef __HIP_PLATFORM_NVIDIA__ #define VVAR_KERNEL_SM_BLOCK 32 #define GGA_KERNEL_SM_WARPS 16 #define MGGA_KERNEL_SM_BLOCK 32 +#else +#define VVAR_KERNEL_SM_BLOCK 16 +#define GGA_KERNEL_SM_WARPS 8 +#define MGGA_KERNEL_SM_BLOCK 16 +#endif __global__ void eval_uvars_lda_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { // eval_vvars populated uvar storage already in the case of LDA+RKS From 5882e885e97933793bd79eb98bc4386e8356f5da Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Mon, 30 Jun 2025 21:11:06 +0800 Subject: [PATCH 26/27] Remove shmem to fix bug in HIP vvar_grad kernel --- .../device/hip/kernels/uvvars.hip | 82 ++++++------------- 1 file changed, 27 insertions(+), 55 deletions(-) diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip index 478f0fe6..4d7707fa 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip @@ -14,11 +14,9 @@ namespace GauXC { #ifdef __HIP_PLATFORM_NVIDIA__ -#define VVAR_KERNEL_SM_BLOCK 32 #define GGA_KERNEL_SM_WARPS 16 #define MGGA_KERNEL_SM_BLOCK 32 #else -#define 
VVAR_KERNEL_SM_BLOCK 16 #define GGA_KERNEL_SM_WARPS 8 #define MGGA_KERNEL_SM_BLOCK 16 #endif @@ -501,8 +499,6 @@ __global__ void eval_vvar_grad_kern( size_t ntasks, double* den_y_eval_device = nullptr; double* den_z_eval_device = nullptr; - constexpr auto warp_size = hip::warp_size; - if constexpr (den_select == DEN_S) { den_eval_device = task.den_s; den_x_eval_device = task.dden_sx; @@ -535,63 +531,39 @@ __global__ void eval_vvar_grad_kern( size_t ntasks, const auto* den_basis_prod_device = task.zmat; - __shared__ double den_shared[4][warp_size][VVAR_KERNEL_SM_BLOCK+1]; + double den_reg = 0.; + double dx_reg = 0.; + double dy_reg = 0.; + double dz_reg = 0.; - for ( int bid_x = blockIdx.x * blockDim.x; - bid_x < nbf; - bid_x += blockDim.x * gridDim.x ) { - - for ( int bid_y = blockIdx.y * VVAR_KERNEL_SM_BLOCK; - bid_y < npts; - bid_y += VVAR_KERNEL_SM_BLOCK * gridDim.y ) { - - for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) { - den_shared[0][threadIdx.x][sm_y] = 0.; - den_shared[1][threadIdx.x][sm_y] = 0.; - den_shared[2][threadIdx.x][sm_y] = 0.; - den_shared[3][threadIdx.x][sm_y] = 0.; + int ipt = blockIdx.x * blockDim.x + threadIdx.x; - if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { - const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts; - const double* bf_col = basis_eval_device + (bid_x + sm_y)*npts; - const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts; - const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts; - const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts; + if (ipt < npts) { - den_shared[0][threadIdx.x][sm_y] = bf_col [ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[1][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[2][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[3][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - } - } - __syncthreads(); + // Have each thread accumulate its own reduction result into a register. + // There's no real _need_ for LDS because the reductions are small and + // therefore can be done without sharing. + for( int ibf = 0; ibf < nbf; ibf++ ) { - for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) { - const int tid_y = bid_y + sm_y; - double den_reg = den_shared[0][sm_y][threadIdx.x]; - double dx_reg = den_shared[1][sm_y][threadIdx.x]; - double dy_reg = den_shared[2][sm_y][threadIdx.x]; - double dz_reg = den_shared[3][sm_y][threadIdx.x]; + const double* bf_col = basis_eval_device + ibf*npts; + const double* bf_x_col = dbasis_x_eval_device + ibf*npts; + const double* bf_y_col = dbasis_y_eval_device + ibf*npts; + const double* bf_z_col = dbasis_z_eval_device + ibf*npts; + const double* db_col = den_basis_prod_device + ibf*npts; - // Warp blocks are stored col major - den_reg = hip::warp_reduce_sum( den_reg ); - dx_reg = 2. * hip::warp_reduce_sum( dx_reg ); - dy_reg = 2. * hip::warp_reduce_sum( dy_reg ); - dz_reg = 2. 
* hip::warp_reduce_sum( dz_reg ); + den_reg += bf_col[ ipt ] * db_col[ ipt ]; + dx_reg += 2 * bf_x_col[ ipt ] * db_col[ ipt ]; + dy_reg += 2 * bf_y_col[ ipt ] * db_col[ ipt ]; + dz_reg += 2 * bf_z_col[ ipt ] * db_col[ ipt ]; + } - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); - atomicAdd( den_x_eval_device + tid_y, dx_reg ); - atomicAdd( den_y_eval_device + tid_y, dy_reg ); - atomicAdd( den_z_eval_device + tid_y, dz_reg ); - } - } - __syncthreads(); - } + den_eval_device [ipt] = den_reg; + den_x_eval_device [ipt] = dx_reg ; + den_y_eval_device [ipt] = dy_reg ; + den_z_eval_device [ipt] = dz_reg ; } - } @@ -656,9 +628,9 @@ void eval_vvar( size_t ntasks, int32_t nbf_max, int32_t npts_max, bool do_grad, dim3 threads; dim3 blocks; if( do_grad ) { - threads = dim3( hip::warp_size, hip::max_warps_per_thread_block / 2, 1 ); - blocks = dim3( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )), - std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )), + threads = dim3(hip::max_warps_per_thread_block, 1, 1); + blocks = dim3( util::div_ceil( npts_max, threads.x ), + 1, ntasks ); } else { threads = dim3( hip::warp_size, hip::max_warps_per_thread_block, 1 ); From 39498ddbdcaf682a6a53542a5f1e1a1baf4f9042 Mon Sep 17 00:00:00 2001 From: Ryan Stocks Date: Wed, 30 Jul 2025 05:05:34 +0800 Subject: [PATCH 27/27] Fix MGGA on AMD --- .../device/hip/kernels/uvvars.hip | 98 ++++++------------- 1 file changed, 28 insertions(+), 70 deletions(-) diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip index 4d7707fa..976d4d80 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip @@ -13,13 +13,7 @@ namespace GauXC { -#ifdef __HIP_PLATFORM_NVIDIA__ #define GGA_KERNEL_SM_WARPS 16 -#define MGGA_KERNEL_SM_BLOCK 32 -#else -#define GGA_KERNEL_SM_WARPS 8 -#define MGGA_KERNEL_SM_BLOCK 16 -#endif __global__ void eval_uvars_lda_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { // eval_vvars populated uvar storage already in the case of LDA+RKS @@ -341,76 +335,40 @@ __global__ void eval_uvars_mgga_rks_kernel( size_t ntasks, den_basis_prod_device = task.zmat; } - __shared__ double den_shared[3+!!need_lapl][warp_size][MGGA_KERNEL_SM_BLOCK+1]; + int ipt = blockIdx.x * blockDim.x + threadIdx.x; + if (ipt < npts) { + double tau_reg = 0; + double lapl_reg = 0; - for ( int bid_x = blockIdx.x * blockDim.x; - bid_x < nbf; - bid_x += blockDim.x * gridDim.x ) { - - for ( int bid_y = blockIdx.y * MGGA_KERNEL_SM_BLOCK; - bid_y < npts; - bid_y += MGGA_KERNEL_SM_BLOCK * gridDim.y ) { - - for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) { - den_shared[0][threadIdx.x][sm_y] = 0.; - den_shared[1][threadIdx.x][sm_y] = 0.; - den_shared[2][threadIdx.x][sm_y] = 0.; - if constexpr (need_lapl) - den_shared[3][threadIdx.x][sm_y] = 0.; + for( int ibf = 0; ibf < nbf; ibf++ ) { - if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { - const double* db_x_col = den_basis_dx_prod_device + (bid_x + sm_y)*npts; - const double* db_y_col = den_basis_dy_prod_device + (bid_x + sm_y)*npts; - const double* db_z_col = den_basis_dz_prod_device + (bid_x + sm_y)*npts; + const double* db_x_col = den_basis_dx_prod_device + ibf*npts; + const double* db_y_col = den_basis_dy_prod_device + ibf*npts; + const double* db_z_col = den_basis_dz_prod_device + ibf*npts; - const double* bf_x_col = dbasis_x_eval_device + 
(bid_x + sm_y)*npts;
-          const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts;
-          const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts;
+      const double* bf_x_col = dbasis_x_eval_device + ibf*npts;
+      const double* bf_y_col = dbasis_y_eval_device + ibf*npts;
+      const double* bf_z_col = dbasis_z_eval_device + ibf*npts;
 
-          den_shared[0][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_x_col[ bid_y + threadIdx.x ];
-          den_shared[1][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_y_col[ bid_y + threadIdx.x ];
-          den_shared[2][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_z_col[ bid_y + threadIdx.x ];
+      tau_reg += bf_x_col[ipt] * db_x_col[ipt];
+      tau_reg += bf_y_col[ipt] * db_y_col[ipt];
+      tau_reg += bf_z_col[ipt] * db_z_col[ipt];
 
       if constexpr (need_lapl) {
-            const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts;
-            const double* bf_l_col = basis_lapl_eval_device + (bid_x + sm_y)*npts;
-            den_shared[3][threadIdx.x][sm_y] = bf_l_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ];
-          }
-        }
-      __syncthreads();
-
-
-      for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) {
-        const int tid_y = bid_y + sm_y;
-
-        double tx_reg = den_shared[0][sm_y][threadIdx.x];
-        double ty_reg = den_shared[1][sm_y][threadIdx.x];
-        double tz_reg = den_shared[2][sm_y][threadIdx.x];
-        // Warp blocks are stored col major
-        double tau_reg = 0.0;
-        tau_reg = 0.5 * hip::warp_reduce_sum( tx_reg );
-        tau_reg += 0.5 * hip::warp_reduce_sum( ty_reg );
-        tau_reg += 0.5 * hip::warp_reduce_sum( tz_reg );
-
-        double lapl_reg = 0.0;
-        if constexpr (need_lapl) {
-          lapl_reg = den_shared[3][sm_y][threadIdx.x];
-          lapl_reg = hip::warp_reduce_sum(lapl_reg);
-          lapl_reg = 2. * lapl_reg + 4. * tau_reg;
-        }
-
-        if( threadIdx.x == 0 and tid_y < npts ) {
-          atomicAdd( tau_eval_device + tid_y, tau_reg );
-          if constexpr (need_lapl) {
-            atomicAdd( lapl_eval_device + tid_y, lapl_reg );
+        const double* db_col = den_basis_prod_device + ibf*npts;
+        const double* bf_l_col = basis_lapl_eval_device + ibf*npts;
+        lapl_reg += bf_l_col[ipt] * db_col[ipt];
       }
-        }
     }
-      __syncthreads();
-    }
+
+
+    tau_reg = 0.5 * tau_reg;
+    lapl_reg = 2. * lapl_reg + 4. * tau_reg;
+
+    tau_eval_device[ipt] = tau_reg;
+    lapl_eval_device[ipt] = lapl_reg;
   }
 }
 
@@ -458,10 +416,10 @@ void eval_uvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
   // U Variables
   {
-  dim3 threads( hip::warp_size, hip::max_warps_per_thread_block / 2, 1 );
-  dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-              std::min(uint64_t(MGGA_KERNEL_SM_BLOCK), util::div_ceil( npts_max, MGGA_KERNEL_SM_BLOCK )),
-              ntasks );
+  dim3 threads(hip::max_warps_per_thread_block, 1,1 );
+  dim3 blocks = dim3( util::div_ceil( npts_max, threads.x ),
+                      1,
+                      ntasks );
   if(do_lapl)
     eval_uvars_mgga_rks_kernel<true><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
   else
     eval_uvars_mgga_rks_kernel<false><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
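
Note on the register-accumulation pattern in patches 26 and 27: both commits replace a shared-memory (LDS) tile plus hip::warp_reduce_sum scheme with plain per-thread accumulation. Each thread owns a single grid point ipt and loops over all nbf basis functions of its task, so the kernel needs no cross-thread reduction, no __syncthreads(), and no atomicAdd, and it no longer depends on the warp size, which differs between back ends (64 on AMD, 32 when HIP targets NVIDIA, per patch 19). The listing below is a minimal standalone HIP sketch of that access pattern, not GauXC code: the kernel name vvar_register_accum, the array names, and the problem sizes are illustrative only.

#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

// One thread per grid point. Arrays are column major, as in the patched
// kernels: element (ibf, ipt) of an nbf-by-npts matrix lives at ibf*npts + ipt.
__global__ void vvar_register_accum( int npts, int nbf,
                                     const double* bf,   // basis values
                                     const double* zmat, // density-basis product
                                     double* den ) {
  const int ipt = blockIdx.x * blockDim.x + threadIdx.x;
  if( ipt >= npts ) return;

  // Accumulate the dot product for this point entirely in a register
  double den_reg = 0.;
  for( int ibf = 0; ibf < nbf; ++ibf )
    den_reg += bf[ ibf*npts + ipt ] * zmat[ ibf*npts + ipt ];

  den[ ipt ] = den_reg; // direct store: no LDS, warp reduction, or atomics
}

int main() {
  const int npts = 1000, nbf = 64;
  std::vector<double> bf(size_t(nbf)*npts, 1.0), z(size_t(nbf)*npts, 0.5), den(npts);

  double *bf_d, *z_d, *den_d;
  hipMalloc( (void**)&bf_d,  bf.size()*sizeof(double) );
  hipMalloc( (void**)&z_d,   z.size()*sizeof(double) );
  hipMalloc( (void**)&den_d, den.size()*sizeof(double) );
  hipMemcpy( bf_d, bf.data(), bf.size()*sizeof(double), hipMemcpyHostToDevice );
  hipMemcpy( z_d,  z.data(),  z.size()*sizeof(double),  hipMemcpyHostToDevice );

  const int threads = 256;
  const int blocks  = (npts + threads - 1) / threads;
  vvar_register_accum<<< blocks, threads, 0, 0 >>>( npts, nbf, bf_d, z_d, den_d );

  hipMemcpy( den.data(), den_d, den.size()*sizeof(double), hipMemcpyDeviceToHost );
  std::printf( "den[0] = %f (expect %f)\n", den[0], 0.5*nbf ); // 64 * (1.0*0.5) = 32

  hipFree(bf_d); hipFree(z_d); hipFree(den_d);
  return 0;
}

The trade-off is that each basis-function column is now read with plain strided loads instead of being staged through shared memory; as the comment added in patch 26 notes, the per-point reductions here are small enough that sharing buys little.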