From bdf253790286fcb32f11dc1b6ad3a3c40bf76667 Mon Sep 17 00:00:00 2001
From: yilabs <yilabs.com@gmail.com>
Date: Wed, 24 Aug 2022 12:58:17 -0400
Subject: [PATCH 1/2] add twed_malloc_dvc, twed_free_dvc, twed_copy_to_dvc

---
 cuTWED/cuTWED.h.i | 10 ++++++++++
 src/cuTWED.cu     | 12 ++++++++++++
 src/cuTWED.h      | 10 ++++++++++
 src/cuTWED_core.h | 42 ++++++++++++++++++++++++------------------
 4 files changed, 56 insertions(+), 18 deletions(-)

diff --git a/cuTWED/cuTWED.h.i b/cuTWED/cuTWED.h.i
index 9e0968c..5c165db 100644
--- a/cuTWED/cuTWED.h.i
+++ b/cuTWED/cuTWED.h.i
@@ -13,19 +13,29 @@
   float twed_devf(float A_dev[], int nA, float TA_dev[],
                   float B_dev[], int nB, float TB_dev[],
                   float nu, float lambda, int degree, int dim);
+  void twed_malloc_dvc(const int nA, double **A_dev, double **TA_dev,
+                       const int dim, const int nAA);
   void twed_malloc_dev(const int nA, double **A_dev, double **TA_dev,
                        const int nB, double **B_dev, double **TB_dev,
                        const int dim, const int nAA, const int nBB);
+  void twed_malloc_dvcf(const int nA, float **A_dev, float **TA_dev,
+                        const int dim, const int nAA);
   void twed_malloc_devf(const int nA, float **A_dev, float **TA_dev,
                         const int nB, float **B_dev, float **TB_dev,
                         const int dim, const int nAA, const int nBB);
+  void twed_free_dvc(double *A_dev, double *TA_dev);
   void twed_free_dev(double *A_dev, double *TA_dev,
                      double *B_dev, double *TB_dev);
+  void twed_free_dvcf(float *A_dev, float *TA_dev);
   void twed_free_devf(float *A_dev, float *TA_dev,
                       float *B_dev, float *TB_dev);
+  void twed_copy_to_dvc(const int nA, double A[], double A_dev[], double TA[], double TA_dev[],
+                        const int dim, const int nAA);
   void twed_copy_to_dev(const int nA, double A[], double A_dev[], double TA[], double TA_dev[],
                         const int nB, double B[], double B_dev[], double TB[], double TB_dev[],
                         const int dim, const int nAA, const int nBB);
+  void twed_copy_to_dvcf(const int nA, float A[], float A_dev[], float TA[], float TA_dev[],
+                         const int dim, const int nAA);
   void twed_copy_to_devf(const int nA, float A[], float A_dev[], float TA[], float TA_dev[],
                          const int nB, float B[], float B_dev[], float TB[], float TB_dev[],
                          const int dim, const int nAA, const int nBB);
diff --git a/src/cuTWED.cu b/src/cuTWED.cu
index dd9a05b..4aba8a3 100644
--- a/src/cuTWED.cu
+++ b/src/cuTWED.cu
@@ -118,8 +118,11 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col)
 */
 
 #define REAL_t double
+#define _TWED_MALLOC_DVC twed_malloc_dvc
 #define _TWED_MALLOC_DEV twed_malloc_dev
+#define _TWED_FREE_DVC twed_free_dvc
 #define _TWED_FREE_DEV twed_free_dev
+#define _TWED_COPY_TO_DVC twed_copy_to_dvc
 #define _TWED_COPY_TO_DEV twed_copy_to_dev
 #define _TWED_DEV twed_dev
 #define _TWED twed
@@ -128,8 +131,11 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col)
 #define _GEAM cublasDgeam
 #include "cuTWED_core.h"
 #undef REAL_t
+#undef _TWED_MALLOC_DVC
 #undef _TWED_MALLOC_DEV
+#undef _TWED_FREE_DVC
 #undef _TWED_FREE_DEV
+#undef _TWED_COPY_TO_DVC
 #undef _TWED_COPY_TO_DEV
 #undef _TWED_DEV
 #undef _TWED
@@ -138,8 +144,11 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col)
 #undef _GEAM
 
 #define REAL_t float
+#define _TWED_MALLOC_DVC twed_malloc_dvcf
 #define _TWED_MALLOC_DEV twed_malloc_devf
+#define _TWED_FREE_DVC twed_free_dvcf
 #define _TWED_FREE_DEV twed_free_devf
+#define _TWED_COPY_TO_DVC twed_copy_to_dvcf
 #define _TWED_COPY_TO_DEV twed_copy_to_devf
 #define _TWED_DEV twed_devf
 #define _TWED twedf
@@ -148,8 +157,11 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col)
 #define _GEAM cublasSgeam
 #include "cuTWED_core.h"
 #undef REAL_t
+#undef _TWED_MALLOC_DVC
 #undef _TWED_MALLOC_DEV
+#undef _TWED_FREE_DVC
 #undef _TWED_FREE_DEV
+#undef _TWED_COPY_TO_DVC
 #undef _TWED_COPY_TO_DEV
 #undef _TWED_DEV
 #undef _TWED
diff --git a/src/cuTWED.h b/src/cuTWED.h
index f5d0f34..a506f8a 100644
--- a/src/cuTWED.h
+++ b/src/cuTWED.h
@@ -56,10 +56,14 @@ extern "C" {
   /*
     Mallocs memory on device, approximately (6*nA + 6*nB) * sizeof(REAL_t)
   */
+  void twed_malloc_dvc(const int nA, double **A_dev, double  **TA_dev,
+                       const int dim, const int nAA);
   void twed_malloc_dev(const int nA, double **A_dev, double  **TA_dev,
                        const int nB, double **B_dev, double  **TB_dev,
                        const int dim, const int nAA, const int nBB);
 
+  void twed_malloc_dvcf(const int nA, float **A_dev, float  **TA_dev,
+                        const int dim, const int nAA);
   void twed_malloc_devf(const int nA, float **A_dev, float  **TA_dev,
                         const int nB, float **B_dev, float  **TB_dev,
                         const int dim, const int nAA, const int nBB);
@@ -69,9 +73,11 @@ extern "C" {
   /*
     Frees memory malloc'd in twed_malloc_dev
   */
+  void twed_free_dvc(double *A_dev, double  *TA_dev);
   void twed_free_dev(double *A_dev, double  *TA_dev,
                      double *B_dev, double  *TB_dev);
 
+  void twed_free_dvcf(float *A_dev, float  *TA_dev);
   void twed_free_devf(float *A_dev, float  *TA_dev,
                       float *B_dev, float  *TB_dev);
 
@@ -79,10 +85,14 @@ extern "C" {
     Copies data from host to device. You would only use this function if you
     are writing logic to reuse gpu memory.
   */
+  void twed_copy_to_dvc(const int nA, double A[], double A_dev[], double TA[], double TA_dev[],
+                        const int dim, const int nAA);
   void twed_copy_to_dev(const int nA, double A[], double A_dev[], double TA[], double TA_dev[],
                         const int nB, double B[], double B_dev[], double TB[], double TB_dev[],
                         const int dim, const int nAA, const int nBB);
 
+  void twed_copy_to_dvcf(const int nA, float A[], float A_dev[], float TA[], float TA_dev[],
+                         const int dim, const int nAA);
   void twed_copy_to_devf(const int nA, float A[], float A_dev[], float TA[], float TA_dev[],
                          const int nB, float B[], float B_dev[], float TB[], float TB_dev[],
                          const int dim, const int nAA, const int nBB);
diff --git a/src/cuTWED_core.h b/src/cuTWED_core.h
index 851b9a1..c438a1c 100644
--- a/src/cuTWED_core.h
+++ b/src/cuTWED_core.h
@@ -367,40 +367,46 @@ static void grid_evalZ(const REAL_t* __restrict__ A, const REAL_t* __restrict__
 extern "C" {
 #endif
 
-  void _TWED_MALLOC_DEV(const int nA, REAL_t **A_dev, REAL_t  **TA_dev,
-                        const int nB, REAL_t **B_dev, REAL_t  **TB_dev,
-                        const int dim, const int nAA, const int nBB){
-    /* malloc on gpu and copy */
+  void _TWED_MALLOC_DVC(const int nA, REAL_t **A_dev, REAL_t  **TA_dev,
+                        const int dim, const int nAA){
     const size_t sza = nAA*(nA+1) * sizeof(**A_dev);
     HANDLE_ERROR(cudaMalloc(A_dev, sza*dim));
     HANDLE_ERROR(cudaMalloc(TA_dev, sza));
+  }
 
-    const size_t szb = nBB*(nB+1) * sizeof(**B_dev);
-    HANDLE_ERROR(cudaMalloc(B_dev, szb*dim));
-    HANDLE_ERROR(cudaMalloc(TB_dev, szb));
+  void _TWED_MALLOC_DEV(const int nA, REAL_t **A_dev, REAL_t  **TA_dev,
+                        const int nB, REAL_t **B_dev, REAL_t  **TB_dev,
+                        const int dim, const int nAA, const int nBB){
+    /* malloc on gpu and copy */
+    _TWED_MALLOC_DVC(nA, A_dev, TA_dev, dim, nAA);
+    _TWED_MALLOC_DVC(nB, B_dev, TB_dev, dim, nBB);
   }
 
 
+  void _TWED_FREE_DVC(REAL_t *A_dev, REAL_t  *TA_dev) {
+    HANDLE_ERROR(cudaFree( A_dev));
+    HANDLE_ERROR(cudaFree(TA_dev));
+  }
+
   void _TWED_FREE_DEV(REAL_t *A_dev, REAL_t  *TA_dev,
                       REAL_t *B_dev, REAL_t  *TB_dev){
     /* In a minute I'll be free */
-    HANDLE_ERROR(cudaFree(A_dev));
-    HANDLE_ERROR(cudaFree(TA_dev));
-    HANDLE_ERROR(cudaFree(B_dev));
-    HANDLE_ERROR(cudaFree(TB_dev));
+    _TWED_FREE_DVC(A_dev, TA_dev);
+    _TWED_FREE_DVC(B_dev, TB_dev);
   }
 
-
-  void _TWED_COPY_TO_DEV(const int nA, REAL_t A[], REAL_t A_dev[], REAL_t TA[], REAL_t TA_dev[],
-                         const int nB, REAL_t B[], REAL_t B_dev[], REAL_t TB[], REAL_t TB_dev[],
-                         const int dim, const int nAA, const int nBB){
+  void _TWED_COPY_TO_DVC(const int nA, REAL_t A[], REAL_t A_dev[], REAL_t TA[], REAL_t TA_dev[],
+                         const int dim, const int nAA){
     const size_t sza = nAA*nA*sizeof(*A);
     HANDLE_ERROR(cudaMemcpy(A_dev, A, sza*dim, cudaMemcpyHostToDevice));
     HANDLE_ERROR(cudaMemcpy(TA_dev, TA, sza, cudaMemcpyHostToDevice));
+  }
 
-    const size_t szb = nBB*nB*sizeof(*B);
-    HANDLE_ERROR(cudaMemcpy(B_dev, B , szb*dim, cudaMemcpyHostToDevice));
-    HANDLE_ERROR(cudaMemcpy(TB_dev, TB, szb, cudaMemcpyHostToDevice));
+  void _TWED_COPY_TO_DEV(const int nA, REAL_t A[], REAL_t A_dev[], REAL_t TA[], REAL_t TA_dev[],
+                         const int nB, REAL_t B[], REAL_t B_dev[], REAL_t TB[], REAL_t TB_dev[],
+                         const int dim, const int nAA, const int nBB){
+    _TWED_COPY_TO_DVC(nA, A, A_dev, TA, TA_dev, dim, nAA);
+    _TWED_COPY_TO_DVC(nB, B, B_dev, TB, TB_dev, dim, nBB);
   }
 
 

From 7ccb1c7f42576e62a4f86af75d3fb8c9d19a7ab5 Mon Sep 17 00:00:00 2001
From: yilabs <yilabs.com@gmail.com>
Date: Thu, 8 Sep 2022 17:27:20 -0400
Subject: [PATCH 2/2] add twed_malloc_dvc_time_steps and
 twed_malloc_dvc_time_series

---
 cuTWED/cuTWED.h.i |  8 ++++++++
 src/cuTWED.cu     |  8 ++++++++
 src/cuTWED.h      |  8 ++++++++
 src/cuTWED_core.h | 17 ++++++++++++++---
 4 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/cuTWED/cuTWED.h.i b/cuTWED/cuTWED.h.i
index 5c165db..4f0aee4 100644
--- a/cuTWED/cuTWED.h.i
+++ b/cuTWED/cuTWED.h.i
@@ -13,11 +13,19 @@
   float twed_devf(float A_dev[], int nA, float TA_dev[],
                   float B_dev[], int nB, float TB_dev[],
                   float nu, float lambda, int degree, int dim);
+  void twed_malloc_dvc_time_steps(const int nA, double **TA_dev,
+                       const int nAA);
+  void twed_malloc_dvc_time_series(const int nA, double **A_dev,
+                       const int dim, const int nAA);
   void twed_malloc_dvc(const int nA, double **A_dev, double **TA_dev,
                        const int dim, const int nAA);
   void twed_malloc_dev(const int nA, double **A_dev, double **TA_dev,
                        const int nB, double **B_dev, double **TB_dev,
                        const int dim, const int nAA, const int nBB);
+  void twed_malloc_dvc_time_stepsf(const int nA, float **TA_dev,
+                        const int nAA);
+  void twed_malloc_dvc_time_seriesf(const int nA, float **A_dev,
+                        const int dim, const int nAA);
   void twed_malloc_dvcf(const int nA, float **A_dev, float **TA_dev,
                         const int dim, const int nAA);
   void twed_malloc_devf(const int nA, float **A_dev, float **TA_dev,
diff --git a/src/cuTWED.cu b/src/cuTWED.cu
index 4aba8a3..38a5834 100644
--- a/src/cuTWED.cu
+++ b/src/cuTWED.cu
@@ -118,6 +118,8 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col)
 */
 
 #define REAL_t double
+#define _TWED_MALLOC_DVC_TIME_STEPS  twed_malloc_dvc_time_steps
+#define _TWED_MALLOC_DVC_TIME_SERIES twed_malloc_dvc_time_series
 #define _TWED_MALLOC_DVC twed_malloc_dvc
 #define _TWED_MALLOC_DEV twed_malloc_dev
 #define _TWED_FREE_DVC twed_free_dvc
@@ -131,6 +133,8 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col)
 #define _GEAM cublasDgeam
 #include "cuTWED_core.h"
 #undef REAL_t
+#undef _TWED_MALLOC_DVC_TIME_STEPS
+#undef _TWED_MALLOC_DVC_TIME_SERIES
 #undef _TWED_MALLOC_DVC
 #undef _TWED_MALLOC_DEV
 #undef _TWED_FREE_DVC
@@ -144,6 +148,8 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col)
 #undef _GEAM
 
 #define REAL_t float
+#define _TWED_MALLOC_DVC_TIME_STEPS  twed_malloc_dvc_time_stepsf
+#define _TWED_MALLOC_DVC_TIME_SERIES twed_malloc_dvc_time_seriesf
 #define _TWED_MALLOC_DVC twed_malloc_dvcf
 #define _TWED_MALLOC_DEV twed_malloc_devf
 #define _TWED_FREE_DVC twed_free_dvcf
@@ -157,6 +163,8 @@ static __inline__ __host__ __device__ diagIdx_t map_rc_to_diag(int row, int col)
 #define _GEAM cublasSgeam
 #include "cuTWED_core.h"
 #undef REAL_t
+#undef _TWED_MALLOC_DVC_TIME_STEPS
+#undef _TWED_MALLOC_DVC_TIME_SERIES
 #undef _TWED_MALLOC_DVC
 #undef _TWED_MALLOC_DEV
 #undef _TWED_FREE_DVC
diff --git a/src/cuTWED.h b/src/cuTWED.h
index a506f8a..69a8105 100644
--- a/src/cuTWED.h
+++ b/src/cuTWED.h
@@ -56,12 +56,20 @@ extern "C" {
   /*
     Mallocs memory on device, approximately (6*nA + 6*nB) * sizeof(REAL_t)
   */
+  void twed_malloc_dvc_time_steps(const int nA, double **TA_dev,
+                       const int nAA);
+  void twed_malloc_dvc_time_series(const int nA, double **A_dev,
+                       const int dim, const int nAA);
   void twed_malloc_dvc(const int nA, double **A_dev, double  **TA_dev,
                        const int dim, const int nAA);
   void twed_malloc_dev(const int nA, double **A_dev, double  **TA_dev,
                        const int nB, double **B_dev, double  **TB_dev,
                        const int dim, const int nAA, const int nBB);
 
+  void twed_malloc_dvc_time_stepsf(const int nA, float **TA_dev,
+                        const int nAA);
+  void twed_malloc_dvc_time_seriesf(const int nA, float **A_dev,
+                        const int dim, const int nAA);
   void twed_malloc_dvcf(const int nA, float **A_dev, float  **TA_dev,
                         const int dim, const int nAA);
   void twed_malloc_devf(const int nA, float **A_dev, float  **TA_dev,
diff --git a/src/cuTWED_core.h b/src/cuTWED_core.h
index c438a1c..9eb14ef 100644
--- a/src/cuTWED_core.h
+++ b/src/cuTWED_core.h
@@ -367,13 +367,24 @@ static void grid_evalZ(const REAL_t* __restrict__ A, const REAL_t* __restrict__
 extern "C" {
 #endif
 
-  void _TWED_MALLOC_DVC(const int nA, REAL_t **A_dev, REAL_t  **TA_dev,
-                        const int dim, const int nAA){
-    const size_t sza = nAA*(nA+1) * sizeof(**A_dev);
+  void _TWED_MALLOC_DVC_TIME_SERIES(const int nA, REAL_t **A_dev,
+                        const int dim, const int nAA) {
+    const size_t sza = nAA*(nA+1) * sizeof(REAL_t);
     HANDLE_ERROR(cudaMalloc(A_dev, sza*dim));
+  }
+
+  void _TWED_MALLOC_DVC_TIME_STEPS(const int nA, REAL_t  **TA_dev,
+                        const int nAA){
+    const size_t sza = nAA*(nA+1) * sizeof(REAL_t);
     HANDLE_ERROR(cudaMalloc(TA_dev, sza));
   }
 
+  void _TWED_MALLOC_DVC(const int nA, REAL_t **A_dev, REAL_t  **TA_dev,
+                        const int dim, const int nAA){
+    _TWED_MALLOC_DVC_TIME_SERIES(nA, A_dev, dim, nAA);
+    _TWED_MALLOC_DVC_TIME_STEPS(nA, TA_dev, nAA);
+  }
+
   void _TWED_MALLOC_DEV(const int nA, REAL_t **A_dev, REAL_t  **TA_dev,
                         const int nB, REAL_t **B_dev, REAL_t  **TB_dev,
                         const int dim, const int nAA, const int nBB){