From bdf253790286fcb32f11dc1b6ad3a3c40bf76667 Mon Sep 17 00:00:00 2001 From: yilabs Date: Wed, 24 Aug 2022 12:58:17 -0400 Subject: [PATCH 1/2] add twed_malloc_dvc, twed_free_dvc, twed_copy_to_dvc --- cuTWED/cuTWED.h.i | 10 ++++++++++ src/cuTWED.cu | 12 ++++++++++++ src/cuTWED.h | 10 ++++++++++ src/cuTWED_core.h | 42 ++++++++++++++++++++++++------------------ 4 files changed, 56 insertions(+), 18 deletions(-) diff --git a/cuTWED/cuTWED.h.i b/cuTWED/cuTWED.h.i index 9e0968c..5c165db 100644 --- a/cuTWED/cuTWED.h.i +++ b/cuTWED/cuTWED.h.i @@ -13,19 +13,29 @@ float twed_devf(float A_dev[], int nA, float TA_dev[], float B_dev[], int nB, float TB_dev[], float nu, float lambda, int degree, int dim); + void twed_malloc_dvc(const int nA, double **A_dev, double **TA_dev, + const int dim, const int nAA); void twed_malloc_dev(const int nA, double **A_dev, double **TA_dev, const int nB, double **B_dev, double **TB_dev, const int dim, const int nAA, const int nBB); + void twed_malloc_dvcf(const int nA, float **A_dev, float **TA_dev, + const int dim, const int nAA); void twed_malloc_devf(const int nA, float **A_dev, float **TA_dev, const int nB, float **B_dev, float **TB_dev, const int dim, const int nAA, const int nBB); + void twed_free_dvc(double *A_dev, double *TA_dev); void twed_free_dev(double *A_dev, double *TA_dev, double *B_dev, double *TB_dev); + void twed_free_dvcf(float *A_dev, float *TA_dev); void twed_free_devf(float *A_dev, float *TA_dev, float *B_dev, float *TB_dev); + void twed_copy_to_dvc(const int nA, double A[], double A_dev[], double TA[], double TA_dev[], + const int dim, const int nAA); void twed_copy_to_dev(const int nA, double A[], double A_dev[], double TA[], double TA_dev[], const int nB, double B[], double B_dev[], double TB[], double TB_dev[], const int dim, const int nAA, const int nBB); + void twed_copy_to_dvcf(const int nA, float A[], float A_dev[], float TA[], float TA_dev[], + const int dim, const int nAA); void twed_copy_to_devf(const int nA, float A[], float A_dev[], float TA[], float TA_dev[], const int nB, float B[], float B_dev[], float TB[], float TB_dev[], const int dim, const int nAA, const int nBB); diff --git a/src/cuTWED.cu b/src/cuTWED.cu index dd9a05b..4aba8a3 100644 --- a/src/cuTWED.cu +++ b/src/cuTWED.cu @@ -118,8 +118,11 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col) */ #define REAL_t double +#define _TWED_MALLOC_DVC twed_malloc_dvc #define _TWED_MALLOC_DEV twed_malloc_dev +#define _TWED_FREE_DVC twed_free_dvc #define _TWED_FREE_DEV twed_free_dev +#define _TWED_COPY_TO_DVC twed_copy_to_dvc #define _TWED_COPY_TO_DEV twed_copy_to_dev #define _TWED_DEV twed_dev #define _TWED twed @@ -128,8 +131,11 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col) #define _GEAM cublasDgeam #include "cuTWED_core.h" #undef REAL_t +#undef _TWED_MALLOC_DVC #undef _TWED_MALLOC_DEV +#undef _TWED_FREE_DVC #undef _TWED_FREE_DEV +#undef _TWED_COPY_TO_DVC #undef _TWED_COPY_TO_DEV #undef _TWED_DEV #undef _TWED @@ -138,8 +144,11 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col) #undef _GEAM #define REAL_t float +#define _TWED_MALLOC_DVC twed_malloc_dvcf #define _TWED_MALLOC_DEV twed_malloc_devf +#define _TWED_FREE_DVC twed_free_dvcf #define _TWED_FREE_DEV twed_free_devf +#define _TWED_COPY_TO_DVC twed_copy_to_dvcf #define _TWED_COPY_TO_DEV twed_copy_to_devf #define _TWED_DEV twed_devf #define _TWED twedf @@ -148,8 +157,11 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col) #define _GEAM cublasSgeam #include "cuTWED_core.h" #undef REAL_t +#undef _TWED_MALLOC_DVC #undef _TWED_MALLOC_DEV +#undef _TWED_FREE_DVC #undef _TWED_FREE_DEV +#undef _TWED_COPY_TO_DVC #undef _TWED_COPY_TO_DEV #undef _TWED_DEV #undef _TWED diff --git a/src/cuTWED.h b/src/cuTWED.h index f5d0f34..a506f8a 100644 --- a/src/cuTWED.h +++ b/src/cuTWED.h @@ -56,10 +56,14 @@ extern "C" { /* Mallocs memory on device, approximately (6*nA + 6*nB) * sizeof(REAL_t) */ + void twed_malloc_dvc(const int nA, double **A_dev, double **TA_dev, + const int dim, const int nAA); void twed_malloc_dev(const int nA, double **A_dev, double **TA_dev, const int nB, double **B_dev, double **TB_dev, const int dim, const int nAA, const int nBB); + void twed_malloc_dvcf(const int nA, float **A_dev, float **TA_dev, + const int dim, const int nAA); void twed_malloc_devf(const int nA, float **A_dev, float **TA_dev, const int nB, float **B_dev, float **TB_dev, const int dim, const int nAA, const int nBB); @@ -69,9 +73,11 @@ extern "C" { /* Frees memory malloc'd in twed_malloc_dev */ + void twed_free_dvc(double *A_dev, double *TA_dev); void twed_free_dev(double *A_dev, double *TA_dev, double *B_dev, double *TB_dev); + void twed_free_dvcf(float *A_dev, float *TA_dev); void twed_free_devf(float *A_dev, float *TA_dev, float *B_dev, float *TB_dev); @@ -79,10 +85,14 @@ extern "C" { Copies data from host to device. You would only use this function if you are writing logic to reuse gpu memory. */ + void twed_copy_to_dvc(const int nA, double A[], double A_dev[], double TA[], double TA_dev[], + const int dim, const int nAA); void twed_copy_to_dev(const int nA, double A[], double A_dev[], double TA[], double TA_dev[], const int nB, double B[], double B_dev[], double TB[], double TB_dev[], const int dim, const int nAA, const int nBB); + void twed_copy_to_dvcf(const int nA, float A[], float A_dev[], float TA[], float TA_dev[], + const int dim, const int nAA); void twed_copy_to_devf(const int nA, float A[], float A_dev[], float TA[], float TA_dev[], const int nB, float B[], float B_dev[], float TB[], float TB_dev[], const int dim, const int nAA, const int nBB); diff --git a/src/cuTWED_core.h b/src/cuTWED_core.h index 851b9a1..c438a1c 100644 --- a/src/cuTWED_core.h +++ b/src/cuTWED_core.h @@ -367,40 +367,46 @@ static void grid_evalZ(const REAL_t* __restrict__ A, const REAL_t* __restrict__ extern "C" { #endif - void _TWED_MALLOC_DEV(const int nA, REAL_t **A_dev, REAL_t **TA_dev, - const int nB, REAL_t **B_dev, REAL_t **TB_dev, - const int dim, const int nAA, const int nBB){ - /* malloc on gpu and copy */ + void _TWED_MALLOC_DVC(const int nA, REAL_t **A_dev, REAL_t **TA_dev, + const int dim, const int nAA){ const size_t sza = nAA*(nA+1) * sizeof(**A_dev); HANDLE_ERROR(cudaMalloc(A_dev, sza*dim)); HANDLE_ERROR(cudaMalloc(TA_dev, sza)); + } - const size_t szb = nBB*(nB+1) * sizeof(**B_dev); - HANDLE_ERROR(cudaMalloc(B_dev, szb*dim)); - HANDLE_ERROR(cudaMalloc(TB_dev, szb)); + void _TWED_MALLOC_DEV(const int nA, REAL_t **A_dev, REAL_t **TA_dev, + const int nB, REAL_t **B_dev, REAL_t **TB_dev, + const int dim, const int nAA, const int nBB){ + /* malloc on gpu and copy */ + _TWED_MALLOC_DVC(nA, A_dev, TA_dev, dim, nAA); + _TWED_MALLOC_DVC(nB, B_dev, TB_dev, dim, nBB); } + void _TWED_FREE_DVC(REAL_t *A_dev, REAL_t *TA_dev) { + HANDLE_ERROR(cudaFree( A_dev)); + HANDLE_ERROR(cudaFree(TA_dev)); + } + void _TWED_FREE_DEV(REAL_t *A_dev, REAL_t *TA_dev, REAL_t *B_dev, REAL_t *TB_dev){ /* In a minute I'll be free */ - HANDLE_ERROR(cudaFree(A_dev)); - HANDLE_ERROR(cudaFree(TA_dev)); - HANDLE_ERROR(cudaFree(B_dev)); - HANDLE_ERROR(cudaFree(TB_dev)); + _TWED_FREE_DVC(A_dev, TA_dev); + _TWED_FREE_DVC(B_dev, TB_dev); } - - void _TWED_COPY_TO_DEV(const int nA, REAL_t A[], REAL_t A_dev[], REAL_t TA[], REAL_t TA_dev[], - const int nB, REAL_t B[], REAL_t B_dev[], REAL_t TB[], REAL_t TB_dev[], - const int dim, const int nAA, const int nBB){ + void _TWED_COPY_TO_DVC(const int nA, REAL_t A[], REAL_t A_dev[], REAL_t TA[], REAL_t TA_dev[], + const int dim, const int nAA){ const size_t sza = nAA*nA*sizeof(*A); HANDLE_ERROR(cudaMemcpy(A_dev, A, sza*dim, cudaMemcpyHostToDevice)); HANDLE_ERROR(cudaMemcpy(TA_dev, TA, sza, cudaMemcpyHostToDevice)); + } - const size_t szb = nBB*nB*sizeof(*B); - HANDLE_ERROR(cudaMemcpy(B_dev, B , szb*dim, cudaMemcpyHostToDevice)); - HANDLE_ERROR(cudaMemcpy(TB_dev, TB, szb, cudaMemcpyHostToDevice)); + void _TWED_COPY_TO_DEV(const int nA, REAL_t A[], REAL_t A_dev[], REAL_t TA[], REAL_t TA_dev[], + const int nB, REAL_t B[], REAL_t B_dev[], REAL_t TB[], REAL_t TB_dev[], + const int dim, const int nAA, const int nBB){ + _TWED_COPY_TO_DVC(nA, A, A_dev, TA, TA_dev, dim, nAA); + _TWED_COPY_TO_DVC(nB, B, B_dev, TB, TB_dev, dim, nBB); } From 7ccb1c7f42576e62a4f86af75d3fb8c9d19a7ab5 Mon Sep 17 00:00:00 2001 From: yilabs Date: Thu, 8 Sep 2022 17:27:20 -0400 Subject: [PATCH 2/2] add twed_malloc_dvc_time_steps and twed_malloc_dvc_time_series --- cuTWED/cuTWED.h.i | 8 ++++++++ src/cuTWED.cu | 8 ++++++++ src/cuTWED.h | 8 ++++++++ src/cuTWED_core.h | 17 ++++++++++++++--- 4 files changed, 38 insertions(+), 3 deletions(-) diff --git a/cuTWED/cuTWED.h.i b/cuTWED/cuTWED.h.i index 5c165db..4f0aee4 100644 --- a/cuTWED/cuTWED.h.i +++ b/cuTWED/cuTWED.h.i @@ -13,11 +13,19 @@ float twed_devf(float A_dev[], int nA, float TA_dev[], float B_dev[], int nB, float TB_dev[], float nu, float lambda, int degree, int dim); + void twed_malloc_dvc_time_steps(const int nA, double **TA_dev, + const int nAA); + void twed_malloc_dvc_time_series(const int nA, double **A_dev, + const int dim, const int nAA); void twed_malloc_dvc(const int nA, double **A_dev, double **TA_dev, const int dim, const int nAA); void twed_malloc_dev(const int nA, double **A_dev, double **TA_dev, const int nB, double **B_dev, double **TB_dev, const int dim, const int nAA, const int nBB); + void twed_malloc_dvc_time_stepsf(const int nA, float **TA_dev, + const int nAA); + void twed_malloc_dvc_time_seriesf(const int nA, float **A_dev, + const int dim, const int nAA); void twed_malloc_dvcf(const int nA, float **A_dev, float **TA_dev, const int dim, const int nAA); void twed_malloc_devf(const int nA, float **A_dev, float **TA_dev, diff --git a/src/cuTWED.cu b/src/cuTWED.cu index 4aba8a3..38a5834 100644 --- a/src/cuTWED.cu +++ b/src/cuTWED.cu @@ -118,6 +118,8 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col) */ #define REAL_t double +#define _TWED_MALLOC_DVC_TIME_STEPS twed_malloc_dvc_time_steps +#define _TWED_MALLOC_DVC_TIME_SERIES twed_malloc_dvc_time_series #define _TWED_MALLOC_DVC twed_malloc_dvc #define _TWED_MALLOC_DEV twed_malloc_dev #define _TWED_FREE_DVC twed_free_dvc @@ -131,6 +133,8 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col) #define _GEAM cublasDgeam #include "cuTWED_core.h" #undef REAL_t +#undef _TWED_MALLOC_DVC_TIME_STEPS +#undef _TWED_MALLOC_DVC_TIME_SERIES #undef _TWED_MALLOC_DVC #undef _TWED_MALLOC_DEV #undef _TWED_FREE_DVC @@ -144,6 +148,8 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col) #undef _GEAM #define REAL_t float +#define _TWED_MALLOC_DVC_TIME_STEPS twed_malloc_dvc_time_stepsf +#define _TWED_MALLOC_DVC_TIME_SERIES twed_malloc_dvc_time_seriesf #define _TWED_MALLOC_DVC twed_malloc_dvcf #define _TWED_MALLOC_DEV twed_malloc_devf #define _TWED_FREE_DVC twed_free_dvcf @@ -157,6 +163,8 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col) #define _GEAM cublasSgeam #include "cuTWED_core.h" #undef REAL_t +#undef _TWED_MALLOC_DVC_TIME_STEPS +#undef _TWED_MALLOC_DVC_TIME_SERIES #undef _TWED_MALLOC_DVC #undef _TWED_MALLOC_DEV #undef _TWED_FREE_DVC diff --git a/src/cuTWED.h b/src/cuTWED.h index a506f8a..69a8105 100644 --- a/src/cuTWED.h +++ b/src/cuTWED.h @@ -56,12 +56,20 @@ extern "C" { /* Mallocs memory on device, approximately (6*nA + 6*nB) * sizeof(REAL_t) */ + void twed_malloc_dvc_time_steps(const int nA, double **TA_dev, + const int nAA); + void twed_malloc_dvc_time_series(const int nA, double **A_dev, + const int dim, const int nAA); void twed_malloc_dvc(const int nA, double **A_dev, double **TA_dev, const int dim, const int nAA); void twed_malloc_dev(const int nA, double **A_dev, double **TA_dev, const int nB, double **B_dev, double **TB_dev, const int dim, const int nAA, const int nBB); + void twed_malloc_dvc_time_stepsf(const int nA, float **TA_dev, + const int nAA); + void twed_malloc_dvc_time_seriesf(const int nA, float **A_dev, + const int dim, const int nAA); void twed_malloc_dvcf(const int nA, float **A_dev, float **TA_dev, const int dim, const int nAA); void twed_malloc_devf(const int nA, float **A_dev, float **TA_dev, diff --git a/src/cuTWED_core.h b/src/cuTWED_core.h index c438a1c..9eb14ef 100644 --- a/src/cuTWED_core.h +++ b/src/cuTWED_core.h @@ -367,13 +367,24 @@ static void grid_evalZ(const REAL_t* __restrict__ A, const REAL_t* __restrict__ extern "C" { #endif - void _TWED_MALLOC_DVC(const int nA, REAL_t **A_dev, REAL_t **TA_dev, - const int dim, const int nAA){ - const size_t sza = nAA*(nA+1) * sizeof(**A_dev); + void _TWED_MALLOC_DVC_TIME_SERIES(const int nA, REAL_t **A_dev, + const int dim, const int nAA) { + const size_t sza = nAA*(nA+1) * sizeof(REAL_t); HANDLE_ERROR(cudaMalloc(A_dev, sza*dim)); + } + + void _TWED_MALLOC_DVC_TIME_STEPS(const int nA, REAL_t **TA_dev, + const int nAA){ + const size_t sza = nAA*(nA+1) * sizeof(REAL_t); HANDLE_ERROR(cudaMalloc(TA_dev, sza)); } + void _TWED_MALLOC_DVC(const int nA, REAL_t **A_dev, REAL_t **TA_dev, + const int dim, const int nAA){ + _TWED_MALLOC_DVC_TIME_SERIES(nA, A_dev, dim, nAA); + _TWED_MALLOC_DVC_TIME_STEPS(nA, TA_dev, nAA); + } + void _TWED_MALLOC_DEV(const int nA, REAL_t **A_dev, REAL_t **TA_dev, const int nB, REAL_t **B_dev, REAL_t **TB_dev, const int dim, const int nAA, const int nBB){