diff --git a/cuTWED/cuTWED.h.i b/cuTWED/cuTWED.h.i index 9e0968c..4f0aee4 100644 --- a/cuTWED/cuTWED.h.i +++ b/cuTWED/cuTWED.h.i @@ -13,19 +13,37 @@ float twed_devf(float A_dev[], int nA, float TA_dev[], float B_dev[], int nB, float TB_dev[], float nu, float lambda, int degree, int dim); + void twed_malloc_dvc_time_steps(const int nA, double **TA_dev, + const int nAA); + void twed_malloc_dvc_time_series(const int nA, double **A_dev, + const int dim, const int nAA); + void twed_malloc_dvc(const int nA, double **A_dev, double **TA_dev, + const int dim, const int nAA); void twed_malloc_dev(const int nA, double **A_dev, double **TA_dev, const int nB, double **B_dev, double **TB_dev, const int dim, const int nAA, const int nBB); + void twed_malloc_dvc_time_stepsf(const int nA, float **TA_dev, + const int nAA); + void twed_malloc_dvc_time_seriesf(const int nA, float **A_dev, + const int dim, const int nAA); + void twed_malloc_dvcf(const int nA, float **A_dev, float **TA_dev, + const int dim, const int nAA); void twed_malloc_devf(const int nA, float **A_dev, float **TA_dev, const int nB, float **B_dev, float **TB_dev, const int dim, const int nAA, const int nBB); + void twed_free_dvc(double *A_dev, double *TA_dev); void twed_free_dev(double *A_dev, double *TA_dev, double *B_dev, double *TB_dev); + void twed_free_dvcf(float *A_dev, float *TA_dev); void twed_free_devf(float *A_dev, float *TA_dev, float *B_dev, float *TB_dev); + void twed_copy_to_dvc(const int nA, double A[], double A_dev[], double TA[], double TA_dev[], + const int dim, const int nAA); void twed_copy_to_dev(const int nA, double A[], double A_dev[], double TA[], double TA_dev[], const int nB, double B[], double B_dev[], double TB[], double TB_dev[], const int dim, const int nAA, const int nBB); + void twed_copy_to_dvcf(const int nA, float A[], float A_dev[], float TA[], float TA_dev[], + const int dim, const int nAA); void twed_copy_to_devf(const int nA, float A[], float A_dev[], float TA[], float TA_dev[], const int nB, float B[], float B_dev[], float TB[], float TB_dev[], const int dim, const int nAA, const int nBB); diff --git a/src/cuTWED.cu b/src/cuTWED.cu index dd9a05b..38a5834 100644 --- a/src/cuTWED.cu +++ b/src/cuTWED.cu @@ -118,8 +118,13 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col) */ #define REAL_t double +#define _TWED_MALLOC_DVC_TIME_STEPS twed_malloc_dvc_time_steps +#define _TWED_MALLOC_DVC_TIME_SERIES twed_malloc_dvc_time_series +#define _TWED_MALLOC_DVC twed_malloc_dvc #define _TWED_MALLOC_DEV twed_malloc_dev +#define _TWED_FREE_DVC twed_free_dvc #define _TWED_FREE_DEV twed_free_dev +#define _TWED_COPY_TO_DVC twed_copy_to_dvc #define _TWED_COPY_TO_DEV twed_copy_to_dev #define _TWED_DEV twed_dev #define _TWED twed @@ -128,8 +133,13 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col) #define _GEAM cublasDgeam #include "cuTWED_core.h" #undef REAL_t +#undef _TWED_MALLOC_DVC_TIME_STEPS +#undef _TWED_MALLOC_DVC_TIME_SERIES +#undef _TWED_MALLOC_DVC #undef _TWED_MALLOC_DEV +#undef _TWED_FREE_DVC #undef _TWED_FREE_DEV +#undef _TWED_COPY_TO_DVC #undef _TWED_COPY_TO_DEV #undef _TWED_DEV #undef _TWED @@ -138,8 +148,13 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col) #undef _GEAM #define REAL_t float +#define _TWED_MALLOC_DVC_TIME_STEPS twed_malloc_dvc_time_stepsf +#define _TWED_MALLOC_DVC_TIME_SERIES twed_malloc_dvc_time_seriesf +#define _TWED_MALLOC_DVC twed_malloc_dvcf #define _TWED_MALLOC_DEV twed_malloc_devf +#define _TWED_FREE_DVC twed_free_dvcf #define _TWED_FREE_DEV twed_free_devf +#define _TWED_COPY_TO_DVC twed_copy_to_dvcf #define _TWED_COPY_TO_DEV twed_copy_to_devf #define _TWED_DEV twed_devf #define _TWED twedf @@ -148,8 +163,13 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col) #define _GEAM cublasSgeam #include "cuTWED_core.h" #undef REAL_t +#undef _TWED_MALLOC_DVC_TIME_STEPS +#undef _TWED_MALLOC_DVC_TIME_SERIES +#undef _TWED_MALLOC_DVC #undef _TWED_MALLOC_DEV +#undef _TWED_FREE_DVC #undef _TWED_FREE_DEV +#undef _TWED_COPY_TO_DVC #undef _TWED_COPY_TO_DEV #undef _TWED_DEV #undef _TWED diff --git a/src/cuTWED.h b/src/cuTWED.h index f5d0f34..69a8105 100644 --- a/src/cuTWED.h +++ b/src/cuTWED.h @@ -56,10 +56,22 @@ extern "C" { /* Mallocs memory on device, approximately (6*nA + 6*nB) * sizeof(REAL_t) */ + void twed_malloc_dvc_time_steps(const int nA, double **TA_dev, + const int nAA); + void twed_malloc_dvc_time_series(const int nA, double **A_dev, + const int dim, const int nAA); + void twed_malloc_dvc(const int nA, double **A_dev, double **TA_dev, + const int dim, const int nAA); void twed_malloc_dev(const int nA, double **A_dev, double **TA_dev, const int nB, double **B_dev, double **TB_dev, const int dim, const int nAA, const int nBB); + void twed_malloc_dvc_time_stepsf(const int nA, float **TA_dev, + const int nAA); + void twed_malloc_dvc_time_seriesf(const int nA, float **A_dev, + const int dim, const int nAA); + void twed_malloc_dvcf(const int nA, float **A_dev, float **TA_dev, + const int dim, const int nAA); void twed_malloc_devf(const int nA, float **A_dev, float **TA_dev, const int nB, float **B_dev, float **TB_dev, const int dim, const int nAA, const int nBB); @@ -69,9 +81,11 @@ extern "C" { /* Frees memory malloc'd in twed_malloc_dev */ + void twed_free_dvc(double *A_dev, double *TA_dev); void twed_free_dev(double *A_dev, double *TA_dev, double *B_dev, double *TB_dev); + void twed_free_dvcf(float *A_dev, float *TA_dev); void twed_free_devf(float *A_dev, float *TA_dev, float *B_dev, float *TB_dev); @@ -79,10 +93,14 @@ extern "C" { Copies data from host to device. You would only use this function if you are writing logic to reuse gpu memory. */ + void twed_copy_to_dvc(const int nA, double A[], double A_dev[], double TA[], double TA_dev[], + const int dim, const int nAA); void twed_copy_to_dev(const int nA, double A[], double A_dev[], double TA[], double TA_dev[], const int nB, double B[], double B_dev[], double TB[], double TB_dev[], const int dim, const int nAA, const int nBB); + void twed_copy_to_dvcf(const int nA, float A[], float A_dev[], float TA[], float TA_dev[], + const int dim, const int nAA); void twed_copy_to_devf(const int nA, float A[], float A_dev[], float TA[], float TA_dev[], const int nB, float B[], float B_dev[], float TB[], float TB_dev[], const int dim, const int nAA, const int nBB); diff --git a/src/cuTWED_core.h b/src/cuTWED_core.h index 851b9a1..9eb14ef 100644 --- a/src/cuTWED_core.h +++ b/src/cuTWED_core.h @@ -367,40 +367,57 @@ static void grid_evalZ(const REAL_t* __restrict__ A, const REAL_t* __restrict__ extern "C" { #endif + void _TWED_MALLOC_DVC_TIME_SERIES(const int nA, REAL_t **A_dev, + const int dim, const int nAA) { + const size_t sza = nAA*(nA+1) * sizeof(REAL_t); + HANDLE_ERROR(cudaMalloc(A_dev, sza*dim)); + } + + void _TWED_MALLOC_DVC_TIME_STEPS(const int nA, REAL_t **TA_dev, + const int nAA){ + const size_t sza = nAA*(nA+1) * sizeof(REAL_t); + HANDLE_ERROR(cudaMalloc(TA_dev, sza)); + } + + void _TWED_MALLOC_DVC(const int nA, REAL_t **A_dev, REAL_t **TA_dev, + const int dim, const int nAA){ + _TWED_MALLOC_DVC_TIME_SERIES(nA, A_dev, dim, nAA); + _TWED_MALLOC_DVC_TIME_STEPS(nA, TA_dev, nAA); + } + void _TWED_MALLOC_DEV(const int nA, REAL_t **A_dev, REAL_t **TA_dev, const int nB, REAL_t **B_dev, REAL_t **TB_dev, const int dim, const int nAA, const int nBB){ /* malloc on gpu and copy */ - const size_t sza = nAA*(nA+1) * sizeof(**A_dev); - HANDLE_ERROR(cudaMalloc(A_dev, sza*dim)); - HANDLE_ERROR(cudaMalloc(TA_dev, sza)); - - const size_t szb = nBB*(nB+1) * sizeof(**B_dev); - HANDLE_ERROR(cudaMalloc(B_dev, szb*dim)); - HANDLE_ERROR(cudaMalloc(TB_dev, szb)); + _TWED_MALLOC_DVC(nA, A_dev, TA_dev, dim, nAA); + _TWED_MALLOC_DVC(nB, B_dev, TB_dev, dim, nBB); } + void _TWED_FREE_DVC(REAL_t *A_dev, REAL_t *TA_dev) { + HANDLE_ERROR(cudaFree( A_dev)); + HANDLE_ERROR(cudaFree(TA_dev)); + } + void _TWED_FREE_DEV(REAL_t *A_dev, REAL_t *TA_dev, REAL_t *B_dev, REAL_t *TB_dev){ /* In a minute I'll be free */ - HANDLE_ERROR(cudaFree(A_dev)); - HANDLE_ERROR(cudaFree(TA_dev)); - HANDLE_ERROR(cudaFree(B_dev)); - HANDLE_ERROR(cudaFree(TB_dev)); + _TWED_FREE_DVC(A_dev, TA_dev); + _TWED_FREE_DVC(B_dev, TB_dev); } - - void _TWED_COPY_TO_DEV(const int nA, REAL_t A[], REAL_t A_dev[], REAL_t TA[], REAL_t TA_dev[], - const int nB, REAL_t B[], REAL_t B_dev[], REAL_t TB[], REAL_t TB_dev[], - const int dim, const int nAA, const int nBB){ + void _TWED_COPY_TO_DVC(const int nA, REAL_t A[], REAL_t A_dev[], REAL_t TA[], REAL_t TA_dev[], + const int dim, const int nAA){ const size_t sza = nAA*nA*sizeof(*A); HANDLE_ERROR(cudaMemcpy(A_dev, A, sza*dim, cudaMemcpyHostToDevice)); HANDLE_ERROR(cudaMemcpy(TA_dev, TA, sza, cudaMemcpyHostToDevice)); + } - const size_t szb = nBB*nB*sizeof(*B); - HANDLE_ERROR(cudaMemcpy(B_dev, B , szb*dim, cudaMemcpyHostToDevice)); - HANDLE_ERROR(cudaMemcpy(TB_dev, TB, szb, cudaMemcpyHostToDevice)); + void _TWED_COPY_TO_DEV(const int nA, REAL_t A[], REAL_t A_dev[], REAL_t TA[], REAL_t TA_dev[], + const int nB, REAL_t B[], REAL_t B_dev[], REAL_t TB[], REAL_t TB_dev[], + const int dim, const int nAA, const int nBB){ + _TWED_COPY_TO_DVC(nA, A, A_dev, TA, TA_dev, dim, nAA); + _TWED_COPY_TO_DVC(nB, B, B_dev, TB, TB_dev, dim, nBB); }