diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 9320af9224..cfc8a893be 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1475,7 +1475,7 @@ endif () "#define L2_ASSOCIATIVE 4\n") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 8) - set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_M 4) set(DGEMM_UNROLL_N 8) set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_N 8) diff --git a/getarch_2nd.c b/getarch_2nd.c index 2085556bd6..744ad5ee7d 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -103,7 +103,55 @@ int main(int argc, char **argv) { printf("XGEMM3M_UNROLL_N=%d\n", QGEMM_DEFAULT_UNROLL_N); #endif +#ifdef ARCH_RISCV64 +#ifdef STRMM_DEFAULT_UNROLL_M + printf("STRMM_UNROLL_M=%d\n", STRMM_DEFAULT_UNROLL_M); +#else + printf("STRMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); +#endif + +#ifdef STRMM_DEFAULT_UNROLL_N + printf("STRMM_UNROLL_N=%d\n", STRMM_DEFAULT_UNROLL_N); +#else + printf("STRMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); +#endif + +#ifdef DTRMM_DEFAULT_UNROLL_M + printf("DTRMM_UNROLL_M=%d\n", DTRMM_DEFAULT_UNROLL_M); +#else + printf("DTRMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); +#endif + +#ifdef DTRMM_DEFAULT_UNROLL_N + printf("DTRMM_UNROLL_N=%d\n", DTRMM_DEFAULT_UNROLL_N); +#else + printf("DTRMM_UNROLL_N=%d\n", DGEMM_DEFAULT_UNROLL_N); +#endif +#ifdef CTRMM_DEFAULT_UNROLL_M + printf("CTRMM_UNROLL_M=%d\n", CTRMM_DEFAULT_UNROLL_M); +#else + printf("CTRMM_UNROLL_M=%d\n", CGEMM_DEFAULT_UNROLL_M); +#endif + +#ifdef CTRMM_DEFAULT_UNROLL_N + printf("CTRMM_UNROLL_N=%d\n", CTRMM_DEFAULT_UNROLL_N); +#else + printf("CTRMM_UNROLL_N=%d\n", CGEMM_DEFAULT_UNROLL_N); +#endif + +#ifdef ZTRMM_DEFAULT_UNROLL_M + printf("ZTRMM_UNROLL_M=%d\n", ZTRMM_DEFAULT_UNROLL_M); +#else + printf("ZTRMM_UNROLL_M=%d\n", ZGEMM_DEFAULT_UNROLL_M); +#endif + +#ifdef ZTRMM_DEFAULT_UNROLL_N + printf("ZTRMM_UNROLL_N=%d\n", ZTRMM_DEFAULT_UNROLL_N); +#else + printf("ZTRMM_UNROLL_N=%d\n", ZGEMM_DEFAULT_UNROLL_N); +#endif +#endif /* ARCH_RISCV64 */ } diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B index ad7db5622e..038d58e8f5 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL128B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -95,10 +95,10 @@ DGEMVTKERNEL = gemv_t_rvv.c CGEMVTKERNEL = zgemv_t_rvv.c ZGEMVTKERNEL = zgemv_t_rvv.c -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +SGEMMKERNEL = sgemm_kernel_zvl128b.c ifneq ($(filter $(SGEMM_UNROLL_N),4 8 16),) SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c -SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv_max.c else SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c @@ -109,7 +109,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) ifneq ($(filter $(SGEMM_UNROLL_M),4 8 16),) SGEMMINCOPY = gemm_ncopy_$(SGEMM_UNROLL_M)_rvv.c -SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv.c +SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv_max.c else SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c @@ -118,10 +118,10 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DGEMMKERNEL = dgemm_kernel_zvl128b.c DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c ifneq ($(filter $(DGEMM_UNROLL_N),4 8 16),) -DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMOTCOPY = 
gemm_tcopy_$(DGEMM_UNROLL_N)_rvv_max.c else DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif @@ -131,7 +131,7 @@ DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c ifneq ($(filter $(DGEMM_UNROLL_M),4 8 16),) -DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv.c +DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv_max.c else DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif @@ -139,55 +139,55 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CGEMMKERNEL = cgemm_kernel_zvl128b.c CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = zgemm_tcopy_$(CGEMM_UNROLL_N)_rvv_max.c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = zgemm_tcopy_$(CGEMM_UNROLL_M)_rvv_max.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZGEMMKERNEL = zgemm_kernel_zvl128b.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_N)_rvv_max.c ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_M)_rvv_max.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c -STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c -STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c -STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c -STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c - -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c -DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c -DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c -DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c -DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c - -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c -CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c -CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c -CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c -CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c - -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c -ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c -ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c -ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c -ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c +STRMMKERNEL = strmm_kernel_$(STRMM_UNROLL_M)x$(STRMM_UNROLL_N)_zvl128b.c +STRMMUNCOPY_M = ../generic/trmm_uncopy_$(STRMM_UNROLL_M).c +STRMMLNCOPY_M = ../generic/trmm_lncopy_$(STRMM_UNROLL_M).c +STRMMUTCOPY_M = ../generic/trmm_utcopy_$(STRMM_UNROLL_M).c 
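Note on the TRMM variables introduced above: $(STRMM_UNROLL_M), $(DTRMM_UNROLL_M), $(CTRMM_UNROLL_M) and $(ZTRMM_UNROLL_M) are emitted by the getarch_2nd.c hunk at the top of this patch, which falls back to the corresponding GEMM unroll factor whenever a target defines no TRMM-specific value. That fallback is what lets the GEMM unrolls change (e.g. DGEMM_UNROLL_M 8 -> 4 for ZVL128B in cmake/prebuild.cmake) while the fixed-shape TRMM kernels keep their old geometry. A minimal, self-contained C sketch of the fallback rule; the concrete values below are assumptions for illustration, not taken from the patch:

    #include <stdio.h>

    /* Assumed values, for illustration only. */
    #define DGEMM_DEFAULT_UNROLL_M 4   /* the new VLEN-agnostic GEMM shape */
    #define DTRMM_DEFAULT_UNROLL_M 8   /* TRMM keeps the old kernel shape */

    int main(void) {
    #ifdef DTRMM_DEFAULT_UNROLL_M
        printf("DTRMM_UNROLL_M=%d\n", DTRMM_DEFAULT_UNROLL_M); /* prints 8 */
    #else
        printf("DTRMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); /* GEMM fallback */
    #endif
        return 0;
    }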
+STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(STRMM_UNROLL_M).c + +DTRMMKERNEL = dtrmm_kernel_$(DTRMM_UNROLL_M)x$(DTRMM_UNROLL_N)_zvl128b.c +DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DTRMM_UNROLL_M).c +DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DTRMM_UNROLL_M).c +DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DTRMM_UNROLL_M).c +DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DTRMM_UNROLL_M).c + +CTRMMKERNEL = ctrmm_kernel_$(CTRMM_UNROLL_M)x$(CTRMM_UNROLL_N)_zvl128b.c +CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CTRMM_UNROLL_M).c +CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CTRMM_UNROLL_M).c +CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CTRMM_UNROLL_M).c +CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CTRMM_UNROLL_M).c + +ZTRMMKERNEL = ztrmm_kernel_$(ZTRMM_UNROLL_M)x$(ZTRMM_UNROLL_N)_zvl128b.c +ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZTRMM_UNROLL_M).c +ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZTRMM_UNROLL_M).c +ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZTRMM_UNROLL_M).c +ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZTRMM_UNROLL_M).c STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index c48095bb21..b95f7ab3e3 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -90,15 +90,15 @@ DGEMVTKERNEL = gemv_t_vector.c CGEMVTKERNEL = zgemv_t_vector.c ZGEMVTKERNEL = zgemv_t_vector.c -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c +STRMMKERNEL = strmm_kernel_$(STRMM_UNROLL_M)x$(STRMM_UNROLL_N)_zvl256b.c +DTRMMKERNEL = dtrmm_kernel_$(DTRMM_UNROLL_M)x$(DTRMM_UNROLL_N)_zvl256b.c +CTRMMKERNEL = ctrmm_kernel_$(CTRMM_UNROLL_M)x$(CTRMM_UNROLL_N)_zvl256b.c +ZTRMMKERNEL = ztrmm_kernel_$(ZTRMM_UNROLL_M)x$(ZTRMM_UNROLL_N)_zvl256b.c -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +SGEMMKERNEL = sgemm_kernel_zvl256b.c ifneq ($(filter $(SGEMM_UNROLL_N),4 8 16),) SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c -SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv_max.c else SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c @@ -108,7 +108,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) ifneq ($(filter $(SGEMM_UNROLL_M),4 8 16),) SGEMMINCOPY = gemm_ncopy_$(SGEMM_UNROLL_M)_rvv.c -SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv.c +SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv_max.c else SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c @@ -117,19 +117,20 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +DGEMMKERNEL = dgemm_kernel_zvl256b.c DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c ifneq ($(filter $(DGEMM_UNROLL_N),4 8 16),) -DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv_max.c else DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif + DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = 
../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c ifneq ($(filter $(DGEMM_UNROLL_M),4 8 16),) -DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv.c +DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv_max.c else DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif @@ -137,28 +138,28 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +CGEMMKERNEL = cgemm_kernel_zvl256b.c CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = zgemm_tcopy_$(CGEMM_UNROLL_N)_rvv_max.c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = zgemm_tcopy_$(CGEMM_UNROLL_M)_rvv_max.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c +ZGEMMKERNEL = zgemm_kernel_zvl256b.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_N)_rvv_max.c ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_M)_rvv_max.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif diff --git a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c deleted file mode 100644 index bd615389c8..0000000000 --- a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c +++ /dev/null @@ -1,996 +0,0 @@ -/* - -AUTOGENERATED KERNEL -Script: ./kernel/riscv64/generate_kernel.py -Settings: - LMUL=2 - M=8 - M_tail_scalar_from=2 - N=4 - __riscv_='__riscv_' - complex=True - conjugate=False - cpu='zvl128b' - force_acc_double=False - index_type='BLASLONG' - op='gemm' - param_precision='float' - reg_width_bits=128 - tail_policy='' - trace=False - -Derived: - ELEN_ACC=32 - ELEN_PARAM=32 - LMUL_ACC=2 - VFMACC='__riscv_vfmacc_vf_f32m2' - VFMUL='__riscv_vfmul_vf_f32m2' - VLEV='__riscv_vle32_v_f32m2' - VLSEV='__riscv_vlse32_v_f32m2' - VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' - VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' - VSETVL='__riscv_vsetvl_e32m2' - VSEV='__riscv_vse32_v_f32m2' - VSSEV='__riscv_vsse32_v_f32m2' - acc_vector_t='vfloat32m2_t' - output='cgemm_kernel_8x4_zvl128b.c' - param_scalar_t='float' - param_vector_t='vfloat32m2_t' - -*/ - -#include "common.h" - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define S0 1 -#define S1 -1 -#define S2 1 -#define S3 1 -#define VFMACC_RR __riscv_vfmsac -#define VFMACC_RI __riscv_vfmacc -#endif -#if defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define S0 1 -#define S1 1 -#define S2 1 -#define S3 -1 -#define VFMACC_RR __riscv_vfmacc -#define VFMACC_RI __riscv_vfmsac -#endif -#if defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define S0 1 -#define S1 1 -#define S2 -1 -#define S3 1 -#define VFMACC_RR __riscv_vfmacc -#define VFMACC_RI __riscv_vfnmsac -#endif -#if defined(RR) || defined(RC) || defined(CR) || defined(CC) -#define S0 1 -#define S1 
-1 -#define S2 -1 -#define S3 -1 -#define VFMACC_RR __riscv_vfmsac -#define VFMACC_RI __riscv_vfnmacc -#endif - -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) - -{ - BLASLONG gvl = 0; - BLASLONG m_top = 0; - BLASLONG n_top = 0; - - // -- MAIN PASS - - for (BLASLONG j = 0; j < N / 4; j += 1) { - m_top = 0; - BLASLONG gvl = __riscv_vsetvl_e32m2(8); - - for (BLASLONG i = 0; i < M / 8; i += 1) { - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - float B1i = B[bi + 1 * 2 + 1]; - float B2r = B[bi + 2 * 2 + 0]; - float B2i = B[bi + 2 * 2 + 1]; - float B3r = B[bi + 3 * 2 + 0]; - float B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k - // leaving 6 vector registers for temporaries - // performing 2 operations between reuses of temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - vfloat32m2_t ACC2r = tmp0r; - vfloat32m2_t ACC2i = tmp0i; - vfloat32m2_t ACC3r = tmp1r; - vfloat32m2_t ACC3i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - B2r = B[bi + 2 * 2 + 0]; - B2i = B[bi + 2 * 2 + 1]; - B3r = B[bi + 3 * 2 + 0]; - B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - 
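A reading aid for the sign macros defined just above (S0..S3 and the VFMACC_RR/VFMACC_RI pair), which this deleted kernel and its shared replacement both rely on: together they select one of the four conjugation variants of the complex product. In the NN/NT/TN/TT case, S1 = -1 gives the real part ar*br - ai*bi; the vector code realizes this by computing the B-imaginary terms first and then folding in the B-real terms with vfmsac/vfmacc. A scalar sketch of that case, with made-up inputs:

    #include <stdio.h>

    /* Scalar model of one NN-case accumulation from the kernel above:
       acc += a*b with S0=1, S1=-1, S2=1, S3=1 (the standard complex
       product). Inputs are made up for illustration. */
    int main(void) {
        float ar = 1.0f, ai = 2.0f;  /* one element of the packed A panel */
        float br = 3.0f, bi = 4.0f;  /* one scalar broadcast from B */
        float acc_r = 0.0f, acc_i = 0.0f;

        acc_r += ar * br - ai * bi;  /* S0*ar*br + S1*ai*bi = -5 */
        acc_i += ai * br + ar * bi;  /* S2*ai*br + S3*ar*bi = 10 */

        printf("(%g, %g)\n", acc_r, acc_i);  /* (1+2i)*(3+4i) = -5+10i */
        return 0;
    }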
tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); - ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); - ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); - ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); - C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); - C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); - C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); - C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); - C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); - C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); - - m_top += 8; - } - - // -- tails for main pass - - if (M & 4) { - gvl = __riscv_vsetvl_e32m2(4); - - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - float B1i = B[bi + 1 * 2 + 1]; - float B2r = B[bi + 2 * 2 + 0]; - float B2i = B[bi + 2 * 2 + 1]; - float B3r = B[bi + 3 * 2 + 0]; - float B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k - // leaving 6 vector registers for temporaries - // performing 2 operations between reuses of temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, 
gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - vfloat32m2_t ACC2r = tmp0r; - vfloat32m2_t ACC2i = tmp0i; - vfloat32m2_t ACC3r = tmp1r; - vfloat32m2_t ACC3i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - B2r = B[bi + 2 * 2 + 0]; - B2i = B[bi + 2 * 2 + 1]; - B3r = B[bi + 3 * 2 + 0]; - B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); - ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); - ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); - ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); - C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); - C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); - C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, 
gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); - C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); - C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); - C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); - - m_top += 4; - } - - if (M & 2) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - float result4 = 0; - float result5 = 0; - float result6 = 0; - float result7 = 0; - float result8 = 0; - float result9 = 0; - float result10 = 0; - float result11 = 0; - float result12 = 0; - float result13 = 0; - float result14 = 0; - float result15 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; - result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; - result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; - result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; - result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; - result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; - result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; - result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; - result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; - result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; - result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; - result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; - ai += 2 * 2; - bi += 4 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 
+ 1]; - Cr += result4 * alphar; - Ci += result5 * alphar; - Cr -= result5 * alphai; - Ci += result4 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; - Cr += result6 * alphar; - Ci += result7 * alphar; - Cr -= result7 * alphai; - Ci += result6 * alphai; - C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; - Cr += result8 * alphar; - Ci += result9 * alphar; - Cr -= result9 * alphai; - Ci += result8 * alphai; - C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; - Cr += result10 * alphar; - Ci += result11 * alphar; - Cr -= result11 * alphai; - Ci += result10 * alphai; - C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; - Cr += result12 * alphar; - Ci += result13 * alphar; - Cr -= result13 * alphai; - Ci += result12 * alphai; - C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; - Cr += result14 * alphar; - Ci += result15 * alphar; - Cr -= result15 * alphai; - Ci += result14 * alphai; - C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; - m_top += 2; - } - - if (M & 1) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - float result4 = 0; - float result5 = 0; - float result6 = 0; - float result7 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; - result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; - result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; - result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; - ai += 1 * 2; - bi += 4 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; - Cr += result4 * alphar; - Ci += result5 * alphar; - Cr -= result5 * alphai; - Ci += result4 * alphai; - C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; - Cr += result6 * alphar; - Ci += result7 * alphar; - Cr -= result7 * alphai; - Ci += result6 * alphai; - C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 
3 * ldc + 0) * 2 + 1] = Ci; - m_top += 1; - } - - n_top += 4; - } - - // -- tails for N=2 - - if (N & 2) { - gvl = __riscv_vsetvl_e32m2(8); - m_top = 0; - - for (BLASLONG i = 0; i < M / 8; i += 1) { - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - float B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k - // leaving 10 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - - m_top += 8; - } - - if (M & 4) { - gvl = __riscv_vsetvl_e32m2(4); - - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - 
float B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k - // leaving 10 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - - m_top += 4; - } - - if (M & 2) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - float result4 = 0; - float result5 = 0; - float result6 = 0; - float result7 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; - result3 += S2 * A[ai + 2 + 1] * 
B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; - result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; - result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; - ai += 2 * 2; - bi += 2 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; - Cr += result4 * alphar; - Ci += result5 * alphar; - Cr -= result5 * alphai; - Ci += result4 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; - Cr += result6 * alphar; - Ci += result7 * alphar; - Cr -= result7 * alphai; - Ci += result6 * alphai; - C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; - m_top += 2; - } - - if (M & 1) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - ai += 1 * 2; - bi += 2 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - m_top += 1; - } - - n_top += 2; - } - - // -- tails for N=1 - - if (N & 1) { - gvl = __riscv_vsetvl_e32m2(8); - m_top = 0; - - for (BLASLONG i = 0; i < M / 8; i += 1) { - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k - // leaving 12 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - vfloat32m2_t 
ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - - m_top += 8; - } - - if (M & 4) { - gvl = __riscv_vsetvl_e32m2(4); - - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k - // leaving 12 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - - m_top += 4; - } - - if (M & 2) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 
1]; - result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; - result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; - ai += 2 * 2; - bi += 1 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; - m_top += 2; - } - - if (M & 1) { - float result0 = 0; - float result1 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - ai += 1 * 2; - bi += 1 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - m_top += 1; - } - - n_top += 1; - } - - return 0; -} diff --git a/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c b/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c deleted file mode 100644 index 7980c029a4..0000000000 --- a/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c +++ /dev/null @@ -1,1931 +0,0 @@ -/* - -AUTOGENERATED KERNEL -Settings: - LMUL=1 - M=8 - M_tail_scalar_from=1 - N=8 - __riscv_='__riscv_' - complex=True - conjugate=False - cpu='zvl256b' - force_acc_double=False - index_type='BLASLONG' - op='gemm' - param_precision='float' - reg_width_bits=256 - tail_policy='' - trace=False - -Derived: - ELEN_ACC=32 - ELEN_PARAM=32 - LMUL_ACC=1 - VFMACC='__riscv_vfmacc_vf_f32m1' - VFMUL='__riscv_vfmul_vf_f32m1' - VLEV='__riscv_vle32_v_f32m1' - VLSEV='__riscv_vlse32_v_f32m1' - VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1' - VMUL_TO_ACC='__riscv_vfmul_vf_f32m1' - VSETVL='__riscv_vsetvl_e32m1' - VSEV='__riscv_vse32_v_f32m1' - VSSEV='__riscv_vsse32_v_f32m1' - acc_vector_t='vfloat32m1_t' - output='cgemm_kernel_8x8_zvl256b.c' - param_scalar_t='float' - param_vector_t='vfloat32m1_t' - -*/ - -#include "common.h" - - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define S0 1 - #define S1 -1 - #define S2 1 - #define S3 1 - #define VFMACC_RR __riscv_vfmsac - #define VFMACC_RI __riscv_vfmacc -#endif -#if defined(NR) || defined(NC) || defined(TR) || defined(TC) - #define S0 1 - #define S1 1 - #define S2 1 - #define S3 -1 - #define VFMACC_RR __riscv_vfmacc - #define VFMACC_RI __riscv_vfmsac -#endif -#if defined(RN) || defined(RT) || defined(CN) || defined(CT) - #define S0 1 - #define S1 1 - #define S2 -1 - #define S3 1 - #define VFMACC_RR __riscv_vfmacc - #define VFMACC_RI __riscv_vfnmsac -#endif -#if defined(RR) || defined(RC) || defined(CR) || defined(CC) - #define S0 1 - #define S1 -1 - #define S2 -1 - #define S3 -1 - #define VFMACC_RR __riscv_vfmsac - #define VFMACC_RI __riscv_vfnmacc -#endif - -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc) - -{ - BLASLONG gvl = 0; - BLASLONG m_top = 0; - 
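The four-statement update repeated throughout the scalar tails above (Cr += result0*alphar; Ci += result1*alphar; Cr -= result1*alphai; Ci += result0*alphai) is the complex scaling C += alpha*acc split into real and imaginary parts. A standalone check with made-up values:

    #include <stdio.h>

    /* C += alpha*acc for complex alpha and acc, in the same statement
       order as the kernels above. Values are made up for illustration. */
    int main(void) {
        float alphar = 2.0f, alphai = 1.0f;
        float acc_r = -5.0f, acc_i = 10.0f;   /* result0, result1 */
        float Cr = 0.0f, Ci = 0.0f;

        Cr += acc_r * alphar;
        Ci += acc_i * alphar;
        Cr -= acc_i * alphai;
        Ci += acc_r * alphai;

        printf("(%g, %g)\n", Cr, Ci);  /* (2+i)*(-5+10i) = -20+15i */
        return 0;
    }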
BLASLONG n_top = 0; - - - // -- MAIN PASS - - for (BLASLONG j=0; j 4 + if (N & 4) + n_top = kernel_column (M, N, K, alpha, A, B, C, ldc, n_top, 4); +#endif +#if N_BLOCKSIZE > 2 + if (N & 2) + n_top = kernel_column (M, N, K, alpha, A, B, C, ldc, n_top, 2); +#endif +#if N_BLOCKSIZE > 1 + if (N & 1) + kernel_column (M, N, K, alpha, A, B, C, ldc, n_top, 1); +#endif + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_16_rvv_max.c b/kernel/riscv64/gemm_tcopy_16_rvv_max.c new file mode 100644 index 0000000000..85975e28be --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_16_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#else +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#endif + +#define BLOCKSIZE 16 + +#include "gemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/gemm_tcopy_4_rvv_max.c b/kernel/riscv64/gemm_tcopy_4_rvv_max.c new file mode 100644 index 0000000000..15af02bd86 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_4_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#else +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#endif + +#define BLOCKSIZE 4 + +#include "gemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/gemm_tcopy_8_rvv_max.c b/kernel/riscv64/gemm_tcopy_8_rvv_max.c new file mode 100644 index 0000000000..0aa0358369 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_8_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#endif + +#define BLOCKSIZE 8 + +#include "gemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/gemm_tcopy_rvv_max_common.h b/kernel/riscv64/gemm_tcopy_rvv_max_common.h new file mode 100644 index 0000000000..425650cb35 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_rvv_max_common.h @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define xstr(s) str(s) +#define str(s) #s + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + IFLOAT *aoffset = a; + IFLOAT *boffset = b; + IFLOAT *boffset2 = b + m * (n & ~(BLOCKSIZE - 1)); + + //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", xstr(CNAME), m, n, lda); + + for(BLASLONG j = m; j > 0; j--) { + IFLOAT *aoffset1 = aoffset; + IFLOAT *boffset1 = boffset; + + aoffset += lda; + boffset += BLOCKSIZE; + + for(BLASLONG i = n / BLOCKSIZE; i > 0; i--) { + size_t vl = BLOCKSIZE; + + FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v, vl); + + aoffset1 += BLOCKSIZE; + boffset1 += BLOCKSIZE * m; + } + + if (n & (BLOCKSIZE - 1)) { + size_t vl = n & (BLOCKSIZE - 1); + + FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v, vl); + + boffset2 += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c deleted file mode 100644 index e22df34f99..0000000000 --- a/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c +++ /dev/null @@ -1,1081 +0,0 @@ -/* - -AUTOGENERATED KERNEL -Settings: - LMUL=1 - M=16 - M_tail_scalar_from=2 - N=8 - __riscv_='__riscv_' - complex=False - conjugate=False - cpu='zvl256b' - force_acc_double=False - index_type='BLASLONG' - op='gemm' - param_precision='float' - reg_width_bits=256 - tail_policy='' - trace=False - -Derived: - ELEN_ACC=32 - ELEN_PARAM=32 - LMUL_ACC=1 - VFMACC='__riscv_vfmacc_vf_f32m1' - VFMUL='__riscv_vfmul_vf_f32m1' - VLEV='__riscv_vle32_v_f32m1' - VLSEV='__riscv_vlse32_v_f32m1' - VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1' - VMUL_TO_ACC='__riscv_vfmul_vf_f32m1' - VSETVL='__riscv_vsetvl_e32m1' - VSEV='__riscv_vse32_v_f32m1' - VSSEV='__riscv_vsse32_v_f32m1' - acc_vector_t='vfloat32m1_t' - output='sgemm_kernel_16x8_zvl256b.c' - param_scalar_t='float' - param_vector_t='vfloat32m1_t' - -*/ - -#include "common.h" - - -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc) - -{ - BLASLONG gvl = 0; - BLASLONG m_top = 0; - BLASLONG n_top = 0; - - - // -- MAIN PASS - - for (BLASLONG j=0; j<N/8; j+=1) { +#if N_BLOCKSIZE > 4 + if (N & 4) + n_top = kernel_column (M, N, K, alphar, alphai, A, B, C, ldc, n_top, 4); +#endif +#if N_BLOCKSIZE > 2 + if (N & 2) + n_top = kernel_column (M, N, K, alphar, alphai, A, B, C, ldc, n_top, 2); +#endif +#if N_BLOCKSIZE > 1 + if (N & 1) + kernel_column (M, N, K, alphar, alphai, A, B, C, ldc, n_top, 1); +#endif + return 0; +} diff --git a/kernel/riscv64/zgemm_kernel_zvl128b.c b/kernel/riscv64/zgemm_kernel_zvl128b.c new file mode 100644 index 0000000000..ad8773cc4b --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_zvl128b.c @@ -0,0 +1,52 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define M_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_N + +#if M_BLOCKSIZE == 2 +#define RVV_MUL __riscv_vfmul_vf_f64m1 +#define RVV_LOAD __riscv_vlse64_v_f64m1 +#define RVV_STORE __riscv_vsse64_v_f64m1 +#define VECTOR_T vfloat64m1_t +#elif M_BLOCKSIZE == 4 +#define RVV_MUL __riscv_vfmul_vf_f64m2 +#define RVV_LOAD __riscv_vlse64_v_f64m2 +#define RVV_STORE __riscv_vsse64_v_f64m2 +#define VECTOR_T vfloat64m2_t +#elif M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f64m4 +#define RVV_LOAD __riscv_vlse64_v_f64m4 +#define RVV_STORE __riscv_vsse64_v_f64m4 +#define VECTOR_T vfloat64m4_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "zgemm_kernel_rvv_vlv_common.h" diff --git a/kernel/riscv64/zgemm_kernel_zvl256b.c b/kernel/riscv64/zgemm_kernel_zvl256b.c new file mode 100644 index 0000000000..084bedb080 --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_zvl256b.c @@ -0,0 +1,52 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define M_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_N + +#if M_BLOCKSIZE == 4 +#define RVV_MUL __riscv_vfmul_vf_f64m1 +#define RVV_LOAD __riscv_vlse64_v_f64m1 +#define RVV_STORE __riscv_vsse64_v_f64m1 +#define VECTOR_T vfloat64m1_t +#elif M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f64m2 +#define RVV_LOAD __riscv_vlse64_v_f64m2 +#define RVV_STORE __riscv_vsse64_v_f64m2 +#define VECTOR_T vfloat64m2_t +#elif M_BLOCKSIZE == 16 +#define RVV_MUL __riscv_vfmul_vf_f64m4 +#define RVV_LOAD __riscv_vlse64_v_f64m4 +#define RVV_STORE __riscv_vsse64_v_f64m4 +#define VECTOR_T vfloat64m4_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "zgemm_kernel_rvv_vlv_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_16_rvv_max.c b/kernel/riscv64/zgemm_tcopy_16_rvv_max.c new file mode 100644 index 0000000000..8545989257 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_16_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#else +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#endif + +#define BLOCKSIZE 16 + +#include "zgemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv_max.c b/kernel/riscv64/zgemm_tcopy_4_rvv_max.c new file mode 100644 index 0000000000..07b86a4862 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_4_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#endif + +#define BLOCKSIZE 4 + +#include "zgemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_8_rvv_max.c b/kernel/riscv64/zgemm_tcopy_8_rvv_max.c new file mode 100644 index 0000000000..53b28a5652 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_8_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#endif + +#define BLOCKSIZE 8 + +#include "zgemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_rvv_max_common.h b/kernel/riscv64/zgemm_tcopy_rvv_max_common.h new file mode 100644 index 0000000000..c0417a5e37 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_rvv_max_common.h @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define xstr(s) str(s) +#define str(s) #s + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + IFLOAT *aoffset = a; + IFLOAT *boffset = b; + IFLOAT *boffset2 = b + m * (n & ~(BLOCKSIZE - 1)) * 2; + + //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", xstr(CNAME), m, n, lda); + + for (BLASLONG j = m; j > 0; j--) { + IFLOAT *aoffset1 = aoffset; + IFLOAT *boffset1 = boffset; + + aoffset += lda * 2; + boffset += BLOCKSIZE * 2; + + for (BLASLONG i = n / BLOCKSIZE; i > 0; i--) { + size_t vl = BLOCKSIZE; + + FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v, vl); + v = VLEV_FLOAT(aoffset1 + vl, vl); + VSEV_FLOAT(boffset1 + vl, v, vl); + + aoffset1 += BLOCKSIZE * 2; + boffset1 += BLOCKSIZE * m * 2; + } + + if (n & (BLOCKSIZE - 1)) { + size_t vl = n & (BLOCKSIZE - 1); + + FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v, vl); + v = VLEV_FLOAT(aoffset1 + vl, vl); + VSEV_FLOAT(boffset2 + vl, v, vl); + + boffset2 += vl * 2; + } + } + + return 0; +} diff --git a/param.h b/param.h index 8e598d8a01..8caa997c14 100644 --- a/param.h +++ b/param.h @@ -3231,7 +3231,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 8 #define CGEMM_DEFAULT_UNROLL_M 8 @@ -3240,6 +3240,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 4 +#define DTRMM_DEFAULT_UNROLL_M 8 +#define DTRMM_DEFAULT_UNROLL_N 8 + #undef SHGEMM_DEFAULT_P #define SHGEMM_DEFAULT_P 128 #undef SBGEMM_DEFAULT_P
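The packing layout that gemm_tcopy_rvv_max_common.h produces is easiest to see in scalar form: each source row contributes BLOCKSIZE consecutive elements to every full column panel, consecutive panels sit BLOCKSIZE*m apart, and the n % BLOCKSIZE leftover columns are packed contiguously starting at b + m * (n & ~(BLOCKSIZE - 1)), which is what boffset2 tracks. Below is a minimal scalar sketch of that layout for the real-valued case; it is illustration only, not part of the patch, and tcopy_ref plus the 3x5 driver are hypothetical names.

/* Scalar reference for the packing done by gemm_tcopy_rvv_max_common.h. */
#include <stdio.h>

typedef long BLASLONG;

static void tcopy_ref(BLASLONG m, BLASLONG n, const double *a, BLASLONG lda,
                      double *b, BLASLONG bs)
{
    double *tail = b + m * (n & ~(bs - 1));      /* same tail base as boffset2 */

    for (BLASLONG j = 0; j < m; j++) {
        const double *row = a + j * lda;
        /* Full panels: row j contributes bs consecutive elements to each
           panel; panels are bs*m apart, matching boffset1 += BLOCKSIZE*m. */
        for (BLASLONG i = 0; i < n / bs; i++)
            for (BLASLONG k = 0; k < bs; k++)
                b[i * bs * m + j * bs + k] = row[i * bs + k];
        /* Remainder columns are packed contiguously after all full panels. */
        for (BLASLONG k = 0; k < (n & (bs - 1)); k++)
            *tail++ = row[(n & ~(bs - 1)) + k];
    }
}

int main(void)
{
    double a[3 * 5], b[3 * 5];
    for (int i = 0; i < 15; i++) a[i] = i;       /* 3x5 row-major source */
    tcopy_ref(3, 5, a, 5, b, 4);
    for (int i = 0; i < 15; i++) printf("%g ", b[i]);
    printf("\n");   /* prints the full 4-wide panel first, then the n%4 tail */
    return 0;
}

The complex variant in zgemm_tcopy_rvv_max_common.h is the same layout with every stride doubled, since each column holds a real/imaginary pair.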
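The kernel_column ladder added above (the #if N_BLOCKSIZE > 4 / (N & 4) chain) processes full panels of N_BLOCKSIZE columns first and then peels the remaining columns in decreasing power-of-two widths. A standalone sketch of that dispatch shape, with a hypothetical do_cols() standing in for kernel_column:

#include <stdio.h>

#define N_BLOCKSIZE 8   /* stands in for the unroll-N default */

/* hypothetical stand-in for kernel_column(): handle `width` columns
   starting at `start`, return the next start column */
static long do_cols(long start, long width)
{
    printf("columns [%ld, %ld)\n", start, start + width);
    return start + width;
}

static void dispatch(long N)
{
    long n_top = 0;
    for (long j = 0; j < N / N_BLOCKSIZE; j++)   /* full panels */
        n_top = do_cols(n_top, N_BLOCKSIZE);
    if (N & 4) n_top = do_cols(n_top, 4);        /* mirrors #if N_BLOCKSIZE > 4 */
    if (N & 2) n_top = do_cols(n_top, 2);
    if (N & 1) do_cols(n_top, 1);
}

int main(void)
{
    dispatch(15);   /* covers 15 = 8 + 4 + 2 + 1 columns */
    return 0;
}

The preprocessor guards compile the peeling steps out entirely when the configured N_BLOCKSIZE makes them unreachable.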
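The M_BLOCKSIZE ladders in zgemm_kernel_zvl128b.c and zgemm_kernel_zvl256b.c pair each unroll factor with the e64 LMUL whose VLMAX (VLEN/64 * LMUL doubles per register group) just covers it: 2/4/8 at VLEN 128, 4/8/16 at VLEN 256. A quick probe to confirm that on a given target, assuming an rv64gcv toolchain and vector-capable hardware or QEMU; illustration only, not part of the patch:

#include <stddef.h>
#include <stdio.h>
#include <riscv_vector.h>

int main(void)
{
    /* VLMAX for e64 elements is VLEN/64 * LMUL. */
    size_t m1 = __riscv_vsetvlmax_e64m1();
    size_t m2 = __riscv_vsetvlmax_e64m2();
    size_t m4 = __riscv_vsetvlmax_e64m4();

    printf("e64 VLMAX: m1=%zu m2=%zu m4=%zu\n", m1, m2, m4);
    /* zvl128b expects 2/4/8 here; zvl256b expects 4/8/16. */
    return 0;
}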