From 744eda853c1e3b9d3ff5b2e0dee49fe55c624dae Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung
Date: Wed, 3 Dec 2025 15:44:23 +0000
Subject: [PATCH 1/6] Add new SGEMM/DGEMM kernels for RISC-V ZVL128B/ZVL256B
 architectures

These kernels use the same algorithm as the original ones, but improve
on them in several respects:

- Common code is shared between the two kernel types and architectures,
  reducing the overall line count.

- Instead of using auto-generated code with each vector operation
  repeated multiple times (necessitated in part by the vector types not
  being storable in arrays), macros and forced inlining are used to
  achieve the same effect directly, giving better clarity and a major
  reduction in lines of code.

- The tiling sizes can be modified within a certain range by changing
  the unroll parameters.

- The RVV extension does not require the number of elements processed
  by a vector operation to be a power of two, so the tails of the
  matrix that involve vector operations can be handled in a single
  operation rather than in decreasing powers of two, thereby improving
  performance. The tcopy operations are modified to take this into
  account.
---
 kernel/riscv64/dgemm_kernel_zvl128b.c       |  55 ++++++
 kernel/riscv64/dgemm_kernel_zvl256b.c       |  55 ++++++
 kernel/riscv64/gemm_kernel_rvv_vlv_common.h | 179 ++++++++++++++++++++
 kernel/riscv64/gemm_tcopy_16_rvv_max.c      |  42 +++++
 kernel/riscv64/gemm_tcopy_4_rvv_max.c       |  42 +++++
 kernel/riscv64/gemm_tcopy_8_rvv_max.c       |  42 +++++
 kernel/riscv64/gemm_tcopy_rvv_max_common.h  |  67 ++++++++
 kernel/riscv64/sgemm_kernel_zvl128b.c       |  55 ++++++
 kernel/riscv64/sgemm_kernel_zvl256b.c       |  49 ++++++
 9 files changed, 586 insertions(+)
 create mode 100644 kernel/riscv64/dgemm_kernel_zvl128b.c
 create mode 100644 kernel/riscv64/dgemm_kernel_zvl256b.c
 create mode 100644 kernel/riscv64/gemm_kernel_rvv_vlv_common.h
 create mode 100644 kernel/riscv64/gemm_tcopy_16_rvv_max.c
 create mode 100644 kernel/riscv64/gemm_tcopy_4_rvv_max.c
 create mode 100644 kernel/riscv64/gemm_tcopy_8_rvv_max.c
 create mode 100644 kernel/riscv64/gemm_tcopy_rvv_max_common.h
 create mode 100644 kernel/riscv64/sgemm_kernel_zvl128b.c
 create mode 100644 kernel/riscv64/sgemm_kernel_zvl256b.c
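As an illustrative sketch of the last point (hypothetical code, not part of the patch; variable names are invented): the RVV intrinsics take the active vector length as a runtime argument, so a tail of, say, three elements can be processed by a single sized operation instead of a two-element step followed by a one-element step:

    /* Hypothetical single-precision tail: m_size may be any value in 1..M_BLOCKSIZE. */
    size_t vl = m_size;                              /* e.g. 3, not a power of two   */
    vfloat32m2_t va = __riscv_vle32_v_f32m2(a, vl);  /* loads only vl elements       */
    vc = __riscv_vfmacc_vf_f32m2(vc, b0, va, vl);    /* updates only the first vl lanes */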
diff --git a/kernel/riscv64/dgemm_kernel_zvl128b.c b/kernel/riscv64/dgemm_kernel_zvl128b.c
new file mode 100644
index 0000000000..45a9400063
--- /dev/null
+++ b/kernel/riscv64/dgemm_kernel_zvl128b.c
@@ -0,0 +1,55 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#define M_BLOCKSIZE DGEMM_UNROLL_M
+#define N_BLOCKSIZE DGEMM_UNROLL_N
+
+#if M_BLOCKSIZE == 2
+#define RVV_MUL __riscv_vfmul_vf_f64m1
+#define RVV_MACC __riscv_vfmacc_vf_f64m1
+#define RVV_LOAD __riscv_vle64_v_f64m1
+#define RVV_STORE __riscv_vse64_v_f64m1
+#define VECTOR_T vfloat64m1_t
+#elif M_BLOCKSIZE == 4
+#define RVV_MUL __riscv_vfmul_vf_f64m2
+#define RVV_MACC __riscv_vfmacc_vf_f64m2
+#define RVV_LOAD __riscv_vle64_v_f64m2
+#define RVV_STORE __riscv_vse64_v_f64m2
+#define VECTOR_T vfloat64m2_t
+#elif M_BLOCKSIZE == 8
+#define RVV_MUL __riscv_vfmul_vf_f64m4
+#define RVV_MACC __riscv_vfmacc_vf_f64m4
+#define RVV_LOAD __riscv_vle64_v_f64m4
+#define RVV_STORE __riscv_vse64_v_f64m4
+#define VECTOR_T vfloat64m4_t
+#else
+#error "Unsupported M_BLOCKSIZE value"
+#endif
+
+#include "gemm_kernel_rvv_vlv_common.h"
diff --git a/kernel/riscv64/dgemm_kernel_zvl256b.c b/kernel/riscv64/dgemm_kernel_zvl256b.c
new file mode 100644
index 0000000000..84e66ab721
--- /dev/null
+++ b/kernel/riscv64/dgemm_kernel_zvl256b.c
@@ -0,0 +1,55 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#include "common.h" + +#define M_BLOCKSIZE DGEMM_UNROLL_M +#define N_BLOCKSIZE DGEMM_UNROLL_N + +#if M_BLOCKSIZE == 4 +#define RVV_MUL __riscv_vfmul_vf_f64m1 +#define RVV_MACC __riscv_vfmacc_vf_f64m1 +#define RVV_LOAD __riscv_vle64_v_f64m1 +#define RVV_STORE __riscv_vse64_v_f64m1 +#define VECTOR_T vfloat64m1_t +#elif M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f64m2 +#define RVV_MACC __riscv_vfmacc_vf_f64m2 +#define RVV_LOAD __riscv_vle64_v_f64m2 +#define RVV_STORE __riscv_vse64_v_f64m2 +#define VECTOR_T vfloat64m2_t +#elif M_BLOCKSIZE == 16 +#define RVV_MUL __riscv_vfmul_vf_f64m4 +#define RVV_MACC __riscv_vfmacc_vf_f64m4 +#define RVV_LOAD __riscv_vle64_v_f64m4 +#define RVV_STORE __riscv_vse64_v_f64m4 +#define VECTOR_T vfloat64m4_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "gemm_kernel_rvv_vlv_common.h" diff --git a/kernel/riscv64/gemm_kernel_rvv_vlv_common.h b/kernel/riscv64/gemm_kernel_rvv_vlv_common.h new file mode 100644 index 0000000000..31c2f0fdc6 --- /dev/null +++ b/kernel/riscv64/gemm_kernel_rvv_vlv_common.h @@ -0,0 +1,179 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define RISCV_REPEAT(INSN, BLEN, ...) 
\
+do { \
+  INSN (0, __VA_ARGS__); \
+  if (BLEN == 1) break; \
+  INSN (1, __VA_ARGS__); \
+  if (BLEN == 2) break; \
+  INSN (2, __VA_ARGS__); \
+  INSN (3, __VA_ARGS__); \
+  if (BLEN == 4) break; \
+  INSN (4, __VA_ARGS__); \
+  INSN (5, __VA_ARGS__); \
+  INSN (6, __VA_ARGS__); \
+  INSN (7, __VA_ARGS__); \
+} while (0)
+
+#define RISCV_MUL(N, DEST, A, B, LEN) \
+  DEST##N = RVV_MUL(A, B[N], LEN);
+
+#define RISCV_ACC_MUL(N, DEST, A, B, LEN) \
+  DEST##N = RVV_MACC(DEST##N, B[N], A, LEN);
+
+#define RISCV_ACC_MUL_CONST(N, DEST, A, B, LEN) \
+  DEST##N = RVV_MACC(DEST##N, B, A##N, LEN);
+
+#define RISCV_LOAD(N, DEST, SRC, OFFSET, LDC, LEN) \
+  DEST##N = RVV_LOAD((SRC) + (OFFSET) + N*(LDC), LEN);
+
+#define RISCV_STORE(N, DEST, SRC, OFFSET, LDC, LEN) \
+  RVV_STORE((DEST) + (OFFSET) + N*(LDC), SRC##N, LEN);
+
+#define RISCV_LOAD_COLUMN(DEST, SRC, OFFSET, LEN) \
+  DEST = RVV_LOAD((SRC) + (OFFSET), LEN);
+
+#define COPY_ROW(DEST, SRC, OFFSET, LEN) \
+do { \
+  DEST[0] = SRC[OFFSET]; \
+  if (LEN == 1) break; \
+  DEST[1] = SRC[OFFSET + 1]; \
+  if (LEN == 2) break; \
+  DEST[2] = SRC[OFFSET + 2]; \
+  DEST[3] = SRC[OFFSET + 3]; \
+  if (LEN == 4) break; \
+  DEST[4] = SRC[OFFSET + 4]; \
+  DEST[5] = SRC[OFFSET + 5]; \
+  DEST[6] = SRC[OFFSET + 6]; \
+  DEST[7] = SRC[OFFSET + 7]; \
+} while (0)
+
+/* Perform matrix multiplication between submatrices:
+     A(m_size,K) * B(K,n_size) = C(m_size,n_size) */
+
+static inline __attribute__((always_inline))
+BLASLONG kernel (BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha,
+                 FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc,
+                 BLASLONG m_top, BLASLONG n_top,
+                 BLASLONG m_size, BLASLONG n_size)
+{
+  BLASLONG ai = m_top*K;
+  BLASLONG bi = n_top*K;
+
+  /* b[0..n_size-1] = B(0, n_top)..B(0, n_top+n_size-1) */
+  FLOAT b[N_BLOCKSIZE];
+  COPY_ROW (b, B, bi, n_size);
+  bi += n_size;
+
+  /* a[0..m_size-1] = A(m_top, 0)..A(m_top+m_size-1, 0) */
+  VECTOR_T a;
+  RISCV_LOAD_COLUMN (a, A, ai, m_size);
+  ai += m_size;
+
+  /* for I = 0..n_size-1
+       resultI[0..m_size-1] = A(m_top..m_top+m_size-1, 0) * B(0, n_top+I) */
+  VECTOR_T result0, result1, result2, result3;
+  VECTOR_T result4, result5, result6, result7;
+  RISCV_REPEAT (RISCV_MUL, n_size, result, a, b, m_size);
+
+  for (BLASLONG k = 1; k < K; k++) {
+    /* b[0..n_size-1] = B(k, n_top)..B(k, n_top+n_size-1) */
+    COPY_ROW (b, B, bi, n_size);
+    bi += n_size;
+
+    /* a[0..m_size-1] = A(m_top, k)..A(m_top+m_size-1, k) */
+    RISCV_LOAD_COLUMN (a, A, ai, m_size);
+    ai += m_size;
+
+    /* for I = 0..n_size-1
+         resultI[0..m_size-1] += A(m_top..m_top+m_size-1, k) * B(k, n_top+I) */
+    RISCV_REPEAT (RISCV_ACC_MUL, n_size, result, a, b, m_size);
+  }
+
+  BLASLONG ci = n_top * ldc + m_top;
+  VECTOR_T c0, c1, c2, c3, c4, c5, c6, c7;
+
+  /* for I = 0..n_size-1
+       cI[0..m_size-1] = C(m_top..m_top+m_size-1, n_top+I)
+       cI[0..m_size-1] += alpha * resultI[0..m_size-1]
+       C(m_top..m_top+m_size-1, n_top+I) = cI[0..m_size-1] */
+  RISCV_REPEAT (RISCV_LOAD, n_size, c, C, ci, ldc, m_size);
+  RISCV_REPEAT (RISCV_ACC_MUL_CONST, n_size, c, result, alpha, m_size);
+  RISCV_REPEAT (RISCV_STORE, n_size, C, c, ci, ldc, m_size);
+
+  return m_top + m_size;
+}
+
+/* Perform matrix multiplication between submatrices:
+     A(M,K) * B(K,n_size) = C(M, n_size) */
+
+static inline __attribute__((always_inline))
+BLASLONG kernel_column (BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha,
+                        FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc,
+                        BLASLONG n_top, BLASLONG n_size)
+{
+  BLASLONG m_top = 0;
+
+  for (BLASLONG i = 0; i < M / M_BLOCKSIZE; i++)
+    m_top = kernel (M, N, K, alpha, A, B, C, ldc, m_top, n_top,
+                    M_BLOCKSIZE, n_size);
+
+  if (M & (M_BLOCKSIZE - 1))
+    kernel (M, N, K, alpha, A, B, C, ldc, m_top, n_top, M - m_top, n_size);
+
+  return n_top + n_size;
+}
+
+#define xstr(s) str(s)
+#define str(s) #s
+
+/* Perform matrix multiplication between matrices:
+     A(M,K) * B(K,N) = C(M,N) */
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
+{
+  //fprintf(stderr, "%s (with VLV): M=%ld, N=%ld, K=%ld, ldc=%ld, m_blocksize=%d, n_blocksize=%d\n", xstr(CNAME), M, N, K, ldc, M_BLOCKSIZE, N_BLOCKSIZE);
+  BLASLONG n_top = 0;
+
+  for (BLASLONG j = 0; j < N / N_BLOCKSIZE; j++)
+    n_top = kernel_column (M, N, K, alpha, A, B, C, ldc, n_top, N_BLOCKSIZE);
+
+#if N_BLOCKSIZE > 4
+  if (N & 4)
+    n_top = kernel_column (M, N, K, alpha, A, B, C, ldc, n_top, 4);
+#endif
+#if N_BLOCKSIZE > 2
+  if (N & 2)
+    n_top = kernel_column (M, N, K, alpha, A, B, C, ldc, n_top, 2);
+#endif
+#if N_BLOCKSIZE > 1
+  if (N & 1)
+    kernel_column (M, N, K, alpha, A, B, C, ldc, n_top, 1);
+#endif
+
+  return 0;
+}
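For reference, a hypothetical expansion (not part of the patch): because kernel() is force-inlined and n_size is a compile-time constant at every call site, the `if (BLEN == ...) break;` guards in RISCV_REPEAT fold away under constant propagation. With n_size == 2, the line

    RISCV_REPEAT (RISCV_MUL, n_size, result, a, b, m_size);

reduces to just

    result0 = RVV_MUL(a, b[0], m_size);
    result1 = RVV_MUL(a, b[1], m_size);

which is the effect the auto-generated kernels previously achieved by spelling out every repetition by hand.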
diff --git a/kernel/riscv64/gemm_tcopy_16_rvv_max.c b/kernel/riscv64/gemm_tcopy_16_rvv_max.c
new file mode 100644
index 0000000000..85975e28be
--- /dev/null
+++ b/kernel/riscv64/gemm_tcopy_16_rvv_max.c
@@ -0,0 +1,42 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define FLOAT_V_T vfloat32m4_t
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VSEV_FLOAT __riscv_vse32_v_f32m4
+#else
+#define FLOAT_V_T vfloat64m8_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m8
+#define VSEV_FLOAT __riscv_vse64_v_f64m8
+#endif
+
+#define BLOCKSIZE 16
+
+#include "gemm_tcopy_rvv_max_common.h"
diff --git a/kernel/riscv64/gemm_tcopy_4_rvv_max.c b/kernel/riscv64/gemm_tcopy_4_rvv_max.c
new file mode 100644
index 0000000000..15af02bd86
--- /dev/null
+++ b/kernel/riscv64/gemm_tcopy_4_rvv_max.c
@@ -0,0 +1,42 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define FLOAT_V_T vfloat32m1_t
+#define VLEV_FLOAT __riscv_vle32_v_f32m1
+#define VSEV_FLOAT __riscv_vse32_v_f32m1
+#else
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m2
+#define VSEV_FLOAT __riscv_vse64_v_f64m2
+#endif
+
+#define BLOCKSIZE 4
+
+#include "gemm_tcopy_rvv_max_common.h"
diff --git a/kernel/riscv64/gemm_tcopy_8_rvv_max.c b/kernel/riscv64/gemm_tcopy_8_rvv_max.c
new file mode 100644
index 0000000000..0aa0358369
--- /dev/null
+++ b/kernel/riscv64/gemm_tcopy_8_rvv_max.c
@@ -0,0 +1,42 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#endif + +#define BLOCKSIZE 8 + +#include "gemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/gemm_tcopy_rvv_max_common.h b/kernel/riscv64/gemm_tcopy_rvv_max_common.h new file mode 100644 index 0000000000..425650cb35 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_rvv_max_common.h @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#define xstr(s) str(s)
+#define str(s) #s
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
+{
+  IFLOAT *aoffset = a;
+  IFLOAT *boffset = b;
+  IFLOAT *boffset2 = b + m * (n & ~(BLOCKSIZE - 1));
+
+  //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", xstr(CNAME), m, n, lda);
+
+  for(BLASLONG j = m; j > 0; j--) {
+    IFLOAT *aoffset1 = aoffset;
+    IFLOAT *boffset1 = boffset;
+
+    aoffset += lda;
+    boffset += BLOCKSIZE;
+
+    for(BLASLONG i = n / BLOCKSIZE; i > 0; i--) {
+      size_t vl = BLOCKSIZE;
+
+      FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl);
+      VSEV_FLOAT(boffset1, v, vl);
+
+      aoffset1 += BLOCKSIZE;
+      boffset1 += BLOCKSIZE * m;
+    }
+
+    if (n & (BLOCKSIZE - 1)) {
+      size_t vl = n & (BLOCKSIZE - 1);
+
+      FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl);
+      VSEV_FLOAT(boffset2, v, vl);
+
+      boffset2 += vl;
+    }
+  }
+
+  return 0;
+}
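As a worked example of the modified tail layout above (illustrative numbers, not from the patch): with m = 2, n = 10 and BLOCKSIZE = 8, the routine writes one full 8-wide panel per row, at b[0..7] (row 0) and b[8..15] (row 1), then packs the two 2-element row tails contiguously starting at boffset2 = b + m * (n & ~(BLOCKSIZE-1)) = b + 16, i.e. at b[16..17] and b[18..19]. Each tail is copied by a single vector operation with vl = n % BLOCKSIZE = 2, instead of the separate power-of-two sub-copies the earlier tcopy kernels required.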
diff --git a/kernel/riscv64/sgemm_kernel_zvl128b.c b/kernel/riscv64/sgemm_kernel_zvl128b.c
new file mode 100644
index 0000000000..60e9d62ac0
--- /dev/null
+++ b/kernel/riscv64/sgemm_kernel_zvl128b.c
@@ -0,0 +1,55 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#define M_BLOCKSIZE SGEMM_UNROLL_M
+#define N_BLOCKSIZE SGEMM_UNROLL_N
+
+#if M_BLOCKSIZE == 4
+#define RVV_MUL __riscv_vfmul_vf_f32m1
+#define RVV_MACC __riscv_vfmacc_vf_f32m1
+#define RVV_LOAD __riscv_vle32_v_f32m1
+#define RVV_STORE __riscv_vse32_v_f32m1
+#define VECTOR_T vfloat32m1_t
+#elif M_BLOCKSIZE == 8
+#define RVV_MUL __riscv_vfmul_vf_f32m2
+#define RVV_MACC __riscv_vfmacc_vf_f32m2
+#define RVV_LOAD __riscv_vle32_v_f32m2
+#define RVV_STORE __riscv_vse32_v_f32m2
+#define VECTOR_T vfloat32m2_t
+#elif M_BLOCKSIZE == 16
+#define RVV_MUL __riscv_vfmul_vf_f32m4
+#define RVV_MACC __riscv_vfmacc_vf_f32m4
+#define RVV_LOAD __riscv_vle32_v_f32m4
+#define RVV_STORE __riscv_vse32_v_f32m4
+#define VECTOR_T vfloat32m4_t
+#else
+#error "Unsupported M_BLOCKSIZE value"
+#endif
+
+#include "gemm_kernel_rvv_vlv_common.h"
diff --git a/kernel/riscv64/sgemm_kernel_zvl256b.c b/kernel/riscv64/sgemm_kernel_zvl256b.c
new file mode 100644
index 0000000000..925e01d102
--- /dev/null
+++ b/kernel/riscv64/sgemm_kernel_zvl256b.c
@@ -0,0 +1,49 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#include "common.h" + +#define M_BLOCKSIZE SGEMM_UNROLL_M +#define N_BLOCKSIZE SGEMM_UNROLL_N + +#if M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f32m1 +#define RVV_MACC __riscv_vfmacc_vf_f32m1 +#define RVV_LOAD __riscv_vle32_v_f32m1 +#define RVV_STORE __riscv_vse32_v_f32m1 +#define VECTOR_T vfloat32m1_t +#elif M_BLOCKSIZE == 16 +#define RVV_MUL __riscv_vfmul_vf_f32m2 +#define RVV_MACC __riscv_vfmacc_vf_f32m2 +#define RVV_LOAD __riscv_vle32_v_f32m2 +#define RVV_STORE __riscv_vse32_v_f32m2 +#define VECTOR_T vfloat32m2_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "gemm_kernel_rvv_vlv_common.h" From 65108acf546f179498cf18330e7e46793c691eaf Mon Sep 17 00:00:00 2001 From: Kwok Cheung Yeung Date: Wed, 3 Dec 2025 15:48:59 +0000 Subject: [PATCH 2/6] Add improved CGEMM and ZGEMM kernels for the RISC-V ZVL128B and ZVL256B architectures This applies the changes made to the SGEMM/DGEMM kernels to their complex CGEMM/ZGEMM equivalents. --- kernel/riscv64/cgemm_kernel_zvl128b.c | 52 ++++ kernel/riscv64/cgemm_kernel_zvl256b.c | 48 ++++ kernel/riscv64/zgemm_kernel_rvv_vlv_common.h | 268 +++++++++++++++++++ kernel/riscv64/zgemm_kernel_zvl128b.c | 52 ++++ kernel/riscv64/zgemm_kernel_zvl256b.c | 52 ++++ kernel/riscv64/zgemm_tcopy_16_rvv_max.c | 42 +++ kernel/riscv64/zgemm_tcopy_4_rvv_max.c | 42 +++ kernel/riscv64/zgemm_tcopy_8_rvv_max.c | 42 +++ kernel/riscv64/zgemm_tcopy_rvv_max_common.h | 71 +++++ 9 files changed, 669 insertions(+) create mode 100644 kernel/riscv64/cgemm_kernel_zvl128b.c create mode 100644 kernel/riscv64/cgemm_kernel_zvl256b.c create mode 100644 kernel/riscv64/zgemm_kernel_rvv_vlv_common.h create mode 100644 kernel/riscv64/zgemm_kernel_zvl128b.c create mode 100644 kernel/riscv64/zgemm_kernel_zvl256b.c create mode 100644 kernel/riscv64/zgemm_tcopy_16_rvv_max.c create mode 100644 kernel/riscv64/zgemm_tcopy_4_rvv_max.c create mode 100644 kernel/riscv64/zgemm_tcopy_8_rvv_max.c create mode 100644 kernel/riscv64/zgemm_tcopy_rvv_max_common.h diff --git a/kernel/riscv64/cgemm_kernel_zvl128b.c b/kernel/riscv64/cgemm_kernel_zvl128b.c new file mode 100644 index 0000000000..c37d365775 --- /dev/null +++ b/kernel/riscv64/cgemm_kernel_zvl128b.c @@ -0,0 +1,52 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define M_BLOCKSIZE CGEMM_UNROLL_M +#define N_BLOCKSIZE CGEMM_UNROLL_N + +#if M_BLOCKSIZE == 4 +#define RVV_MUL __riscv_vfmul_vf_f32m1 +#define RVV_LOAD __riscv_vlse32_v_f32m1 +#define RVV_STORE __riscv_vsse32_v_f32m1 +#define VECTOR_T vfloat32m1_t +#elif M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f32m2 +#define RVV_LOAD __riscv_vlse32_v_f32m2 +#define RVV_STORE __riscv_vsse32_v_f32m2 +#define VECTOR_T vfloat32m2_t +#elif M_BLOCKSIZE == 16 +#define RVV_MUL __riscv_vfmul_vf_f32m4 +#define RVV_LOAD __riscv_vlse32_v_f32m4 +#define RVV_STORE __riscv_vsse32_v_f32m4 +#define VECTOR_T vfloat32m4_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "zgemm_kernel_rvv_vlv_common.h" diff --git a/kernel/riscv64/cgemm_kernel_zvl256b.c b/kernel/riscv64/cgemm_kernel_zvl256b.c new file mode 100644 index 0000000000..8b991bfc4f --- /dev/null +++ b/kernel/riscv64/cgemm_kernel_zvl256b.c @@ -0,0 +1,48 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + + +#define M_BLOCKSIZE CGEMM_UNROLL_M +#define N_BLOCKSIZE CGEMM_UNROLL_N + +#if M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f32m1 +#define RVV_LOAD __riscv_vlse32_v_f32m1 +#define RVV_STORE __riscv_vsse32_v_f32m1 +#define VECTOR_T vfloat32m1_t +#elif M_BLOCKSIZE == 16 +#define RVV_MUL __riscv_vfmul_vf_f32m2 +#define RVV_LOAD __riscv_vlse32_v_f32m2 +#define RVV_STORE __riscv_vsse32_v_f32m2 +#define VECTOR_T vfloat32m2_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "zgemm_kernel_rvv_vlv_common.h" diff --git a/kernel/riscv64/zgemm_kernel_rvv_vlv_common.h b/kernel/riscv64/zgemm_kernel_rvv_vlv_common.h new file mode 100644 index 0000000000..02f443f711 --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_rvv_vlv_common.h @@ -0,0 +1,268 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define S0 1 + #define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfnmacc +#endif + +#define RISCV_REPEAT_1(INSN, BLEN, ...) 
\ +do { \ + INSN (0, 0, __VA_ARGS__); \ + if (BLEN == 1) break; \ + INSN (1, 1, __VA_ARGS__); \ + if (BLEN == 2) break; \ + INSN (2, 2, __VA_ARGS__); \ + INSN (3, 3, __VA_ARGS__); \ +} while (0) + +#define RISCV_REPEAT_2(INSN, BLEN, ...) \ +do { \ + if (BLEN <= 4) break; \ + INSN (0, 4, __VA_ARGS__); \ + INSN (1, 5, __VA_ARGS__); \ + INSN (2, 6, __VA_ARGS__); \ + INSN (3, 7, __VA_ARGS__); \ +} while (0) + +#define RISCV_MUL(M, N, DESTr, DESTi, Ar, Ai, Bi, GVL) \ + DESTr##M = RVV_MUL(Ai, Bi[N], GVL); \ + DESTi##M = RVV_MUL(Ar, Bi[N], GVL); + +#define RISCV_VFMACC(M, N, DESTr, DESTi, Ar, Ai, Br, GVL) \ + DESTr##M = VFMACC_RR(DESTr##M, Br[N], Ar, GVL); \ + DESTi##M = VFMACC_RI(DESTi##M, Br[N], Ai, GVL); + +#define RISCV_ACC_MUL_CONSTR(M, N, DESTr, DESTi, Ar, Ai, B, GVL) \ + DESTr##N = __riscv_vfmacc(DESTr##N, B, Ar##N, GVL); \ + DESTi##N = __riscv_vfmacc(DESTi##N, B, Ai##N, GVL); + +#define RISCV_ACC_MUL_CONSTI(M, N, DESTr, DESTi, Ar, Ai, B, GVL) \ + DESTr##N = __riscv_vfnmsac(DESTr##N, B, Ai##N, GVL); \ + DESTi##N = __riscv_vfmacc(DESTi##N, B, Ar##N, GVL); + +#define RISCV_LOAD(M, N, DESTr, DESTi, SRC, OFFSET, LDC, GVL) \ + DESTr##N = RVV_LOAD((SRC) + ((OFFSET) + N*(LDC)) * 2, sizeof(FLOAT)*2, GVL); \ + DESTi##N = RVV_LOAD((SRC) + ((OFFSET) + N*(LDC)) * 2 + 1, sizeof(FLOAT)*2, GVL); + +#define RISCV_STORE(M, N, DEST, SRCr, SRCi, OFFSET, LDC, GVL) \ + RVV_STORE((DEST) + ((OFFSET) + N*(LDC)) * 2, sizeof(FLOAT)*2, SRCr##N, GVL); \ + RVV_STORE((DEST) + ((OFFSET) + N*(LDC)) * 2 + 1, sizeof(FLOAT)*2, SRCi##N, GVL); + +#define RISCV_LOAD_COLUMN(DESTR, DESTI, SRC, OFFSET, GVL) \ + DESTR = RVV_LOAD((SRC) + (OFFSET), sizeof (FLOAT)*2, GVL); \ + DESTI = RVV_LOAD((SRC) + (OFFSET) + 1, sizeof (FLOAT)*2, GVL); + +#define COPY_TMP(M, N, DESTr, DESTi, SRCr, SRCi) \ + DESTr##N = SRCr##M; \ + DESTi##N = SRCi##M; + +#define RISCV_ADD(M, N, DESTr, DESTi, SRCr, SRCi, GVL) \ + DESTr##N = __riscv_vfadd(DESTr##N, SRCr##M, GVL); \ + DESTi##N = __riscv_vfadd(DESTi##N, SRCi##M, GVL); + +#define COPY_ROW(DESTR, DESTI, SRC, OFFSET, LEN) \ +do { \ + DESTR[0] = SRC[OFFSET]; \ + DESTI[0] = SRC[OFFSET + 1]; \ + if (LEN == 1) break; \ + DESTR[1] = SRC[OFFSET + 2]; \ + DESTI[1] = SRC[OFFSET + 3]; \ + if (LEN == 2) break; \ + DESTR[2] = SRC[OFFSET + 4]; \ + DESTI[2] = SRC[OFFSET + 5]; \ + DESTR[3] = SRC[OFFSET + 6]; \ + DESTI[3] = SRC[OFFSET + 7]; \ + if (LEN == 4) break; \ + DESTR[4] = SRC[OFFSET + 8]; \ + DESTI[4] = SRC[OFFSET + 9]; \ + DESTR[5] = SRC[OFFSET + 10]; \ + DESTI[5] = SRC[OFFSET + 11]; \ + DESTR[6] = SRC[OFFSET + 12]; \ + DESTI[6] = SRC[OFFSET + 13]; \ + DESTR[7] = SRC[OFFSET + 14]; \ + DESTI[7] = SRC[OFFSET + 15]; \ +} while (0) + +/* Perform matrix multiplication between submatrices: + A(m_size,K) * B(K,n_size) = C(m_size,n_size) */ + +static inline __attribute__((always_inline)) +BLASLONG kernel (BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, + FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, + BLASLONG m_top, BLASLONG n_top, + BLASLONG m_size, BLASLONG n_size) +{ + BLASLONG ai = m_top*K*2; + BLASLONG bi = n_top*K*2; + + /* b_r[0..n_size-1] = real(B(0, n_top)..B(0, n_top+n_size-1)) + b_i[0..n_size-1] = imag(B(0, n_top)..B(0, n_top+n_size-1)) */ + FLOAT b_r[N_BLOCKSIZE], b_i[N_BLOCKSIZE]; + COPY_ROW (b_r, b_i, B, bi, n_size); + bi += n_size * 2; + + /* a_r[0..m_size-1] = real(A(m_top, 0)..A(m_top+m_size-1, 0)) + a_i[0..m_size-1] = imag(A(m_top, 0)..A(m_top+m_size-1, 0)) */ + VECTOR_T a_r, a_i; + RISCV_LOAD_COLUMN (a_r, a_i, A, ai, m_size); + ai += m_size * 2; + + /* for I = 0..n_size-1 + 
acc_rI[0..m_size-1] = real(A(m_top..m_top+m_size-1, 0) * B(0, n_top+I))
+       acc_iI[0..m_size-1] = imag(A(m_top..m_top+m_size-1, 0) * B(0, n_top+I)) */
+  VECTOR_T tmp_r0, tmp_i0, tmp_r1, tmp_i1, tmp_r2, tmp_i2, tmp_r3, tmp_i3;
+  VECTOR_T acc_r0, acc_i0, acc_r1, acc_i1, acc_r2, acc_i2, acc_r3, acc_i3;
+  VECTOR_T acc_r4, acc_i4, acc_r5, acc_i5, acc_r6, acc_i6, acc_r7, acc_i7;
+  RISCV_REPEAT_1 (RISCV_MUL, n_size, tmp_r, tmp_i, a_r, a_i, b_i, m_size);
+  RISCV_REPEAT_1 (RISCV_VFMACC, n_size, tmp_r, tmp_i, a_r, a_i, b_r, m_size);
+  RISCV_REPEAT_1 (COPY_TMP, n_size, acc_r, acc_i, tmp_r, tmp_i);
+  RISCV_REPEAT_2 (RISCV_MUL, n_size, tmp_r, tmp_i, a_r, a_i, b_i, m_size);
+  RISCV_REPEAT_2 (RISCV_VFMACC, n_size, tmp_r, tmp_i, a_r, a_i, b_r, m_size);
+  RISCV_REPEAT_2 (COPY_TMP, n_size, acc_r, acc_i, tmp_r, tmp_i);
+
+  for (BLASLONG k = 1; k < K; k++) {
+    /* b_r[0..n_size-1] = real(B(k, n_top)..B(k, n_top+n_size-1))
+       b_i[0..n_size-1] = imag(B(k, n_top)..B(k, n_top+n_size-1)) */
+    COPY_ROW (b_r, b_i, B, bi, n_size);
+    bi += n_size * 2;
+
+    /* a_r[0..m_size-1] = real(A(m_top, k)..A(m_top+m_size-1, k))
+       a_i[0..m_size-1] = imag(A(m_top, k)..A(m_top+m_size-1, k)) */
+    RISCV_LOAD_COLUMN (a_r, a_i, A, ai, m_size);
+    ai += m_size * 2;
+
+    /* for I = 0..n_size-1
+         acc_rI[0..m_size-1] += real(A(m_top..m_top+m_size-1, k) * B(k, n_top+I))
+         acc_iI[0..m_size-1] += imag(A(m_top..m_top+m_size-1, k) * B(k, n_top+I)) */
+    RISCV_REPEAT_1 (RISCV_MUL, n_size, tmp_r, tmp_i, a_r, a_i, b_i, m_size);
+    RISCV_REPEAT_1 (RISCV_VFMACC, n_size, tmp_r, tmp_i, a_r, a_i, b_r, m_size);
+    RISCV_REPEAT_1 (RISCV_ADD, n_size, acc_r, acc_i, tmp_r, tmp_i, m_size);
+    RISCV_REPEAT_2 (RISCV_MUL, n_size, tmp_r, tmp_i, a_r, a_i, b_i, m_size);
+    RISCV_REPEAT_2 (RISCV_VFMACC, n_size, tmp_r, tmp_i, a_r, a_i, b_r, m_size);
+    RISCV_REPEAT_2 (RISCV_ADD, n_size, acc_r, acc_i, tmp_r, tmp_i, m_size);
+  }
+
+  BLASLONG ci = n_top * ldc + m_top;
+  VECTOR_T c_r0, c_i0, c_r1, c_i1, c_r2, c_i2, c_r3, c_i3;
+  VECTOR_T c_r4, c_i4, c_r5, c_i5, c_r6, c_i6, c_r7, c_i7;
+
+  /* for I = 0..n_size-1
+       c_rI[0..m_size-1] = real(C(m_top..m_top+m_size-1, n_top+I))
+       c_iI[0..m_size-1] = imag(C(m_top..m_top+m_size-1, n_top+I))
+       c_rI[0..m_size-1] += alpha_r * acc_rI[0..m_size-1] - alpha_i * acc_iI[0..m_size-1]
+       c_iI[0..m_size-1] += alpha_r * acc_iI[0..m_size-1] + alpha_i * acc_rI[0..m_size-1]
+       real(C(m_top..m_top+m_size-1, n_top+I)) = c_rI[0..m_size-1]
+       imag(C(m_top..m_top+m_size-1, n_top+I)) = c_iI[0..m_size-1] */
+  RISCV_REPEAT_1 (RISCV_LOAD, n_size, c_r, c_i, C, ci, ldc, m_size);
+  RISCV_REPEAT_2 (RISCV_LOAD, n_size, c_r, c_i, C, ci, ldc, m_size);
+  RISCV_REPEAT_1 (RISCV_ACC_MUL_CONSTR, n_size, c_r, c_i, acc_r, acc_i, alphar, m_size);
+  RISCV_REPEAT_2 (RISCV_ACC_MUL_CONSTR, n_size, c_r, c_i, acc_r, acc_i, alphar, m_size);
+  RISCV_REPEAT_1 (RISCV_ACC_MUL_CONSTI, n_size, c_r, c_i, acc_r, acc_i, alphai, m_size);
+  RISCV_REPEAT_2 (RISCV_ACC_MUL_CONSTI, n_size, c_r, c_i, acc_r, acc_i, alphai, m_size);
+  RISCV_REPEAT_1 (RISCV_STORE, n_size, C, c_r, c_i, ci, ldc, m_size);
+  RISCV_REPEAT_2 (RISCV_STORE, n_size, C, c_r, c_i, ci, ldc, m_size);
+
+  return m_top + m_size;
+}
+
+/* Perform matrix multiplication between submatrices:
+     A(M,K) * B(K,n_size) = C(M, n_size) */
+
+static inline __attribute__((always_inline))
+BLASLONG kernel_column (BLASLONG M, BLASLONG N, BLASLONG K,
+                        FLOAT alphar, FLOAT alphai,
+                        FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc,
+                        BLASLONG n_top, BLASLONG n_size)
+{
+  BLASLONG m_top = 0;
+
+  for (BLASLONG i = 0; i < M / M_BLOCKSIZE; i++)
+    m_top = kernel (M, N, K, alphar, alphai, A, B, C, ldc, m_top, n_top, M_BLOCKSIZE, n_size);
+
+
if (M & (M_BLOCKSIZE - 1)) + kernel (M, N, K, alphar, alphai, A, B, C, ldc, m_top, n_top, M - m_top, n_size); + + return n_top + n_size; +} + +#define xstr(s) str(s) +#define str(s) #s + +/* Perform matrix multiplication between matrices: + A(M,K) * B(K,N) = C(M,N) */ + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc) +{ + //fprintf(stderr, "%s (with VLV): M=%ld, N=%ld, K=%ld, ldc=%ld\n", xstr(CNAME), M, N, K, ldc); + BLASLONG n_top = 0; + + for (BLASLONG j = 0; j < N / N_BLOCKSIZE; j++) + n_top = kernel_column (M, N, K, alphar, alphai, A, B, C, ldc, n_top, N_BLOCKSIZE); + +#if N_BLOCKSIZE > 4 + if (N & 4) + n_top = kernel_column (M, N, K, alphar, alphai, A, B, C, ldc, n_top, 4); +#endif +#if N_BLOCKSIZE > 2 + if (N & 2) + n_top = kernel_column (M, N, K, alphar, alphai, A, B, C, ldc, n_top, 2); +#endif +#if N_BLOCKSIZE > 1 + if (N & 1) + kernel_column (M, N, K, alphar, alphai, A, B, C, ldc, n_top, 1); +#endif + return 0; +} diff --git a/kernel/riscv64/zgemm_kernel_zvl128b.c b/kernel/riscv64/zgemm_kernel_zvl128b.c new file mode 100644 index 0000000000..ba8496e425 --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_zvl128b.c @@ -0,0 +1,52 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#define M_BLOCKSIZE ZGEMM_UNROLL_M +#define N_BLOCKSIZE ZGEMM_UNROLL_N + +#if M_BLOCKSIZE == 2 +#define RVV_MUL __riscv_vfmul_vf_f64m1 +#define RVV_LOAD __riscv_vlse64_v_f64m1 +#define RVV_STORE __riscv_vsse64_v_f64m1 +#define VECTOR_T vfloat64m1_t +#elif M_BLOCKSIZE == 4 +#define RVV_MUL __riscv_vfmul_vf_f64m2 +#define RVV_LOAD __riscv_vlse64_v_f64m2 +#define RVV_STORE __riscv_vsse64_v_f64m2 +#define VECTOR_T vfloat64m2_t +#elif M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f64m4 +#define RVV_LOAD __riscv_vlse64_v_f64m4 +#define RVV_STORE __riscv_vsse64_v_f64m4 +#define VECTOR_T vfloat64m4_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "zgemm_kernel_rvv_vlv_common.h" diff --git a/kernel/riscv64/zgemm_kernel_zvl256b.c b/kernel/riscv64/zgemm_kernel_zvl256b.c new file mode 100644 index 0000000000..ff32ba60a5 --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_zvl256b.c @@ -0,0 +1,52 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#define M_BLOCKSIZE ZGEMM_UNROLL_M +#define N_BLOCKSIZE ZGEMM_UNROLL_N + +#if M_BLOCKSIZE == 4 +#define RVV_MUL __riscv_vfmul_vf_f64m1 +#define RVV_LOAD __riscv_vlse64_v_f64m1 +#define RVV_STORE __riscv_vsse64_v_f64m1 +#define VECTOR_T vfloat64m1_t +#elif M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f64m2 +#define RVV_LOAD __riscv_vlse64_v_f64m2 +#define RVV_STORE __riscv_vsse64_v_f64m2 +#define VECTOR_T vfloat64m2_t +#elif M_BLOCKSIZE == 16 +#define RVV_MUL __riscv_vfmul_vf_f64m4 +#define RVV_LOAD __riscv_vlse64_v_f64m4 +#define RVV_STORE __riscv_vsse64_v_f64m4 +#define VECTOR_T vfloat64m4_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "zgemm_kernel_rvv_vlv_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_16_rvv_max.c b/kernel/riscv64/zgemm_tcopy_16_rvv_max.c new file mode 100644 index 0000000000..8545989257 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_16_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#else +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#endif + +#define BLOCKSIZE 16 + +#include "zgemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv_max.c b/kernel/riscv64/zgemm_tcopy_4_rvv_max.c new file mode 100644 index 0000000000..07b86a4862 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_4_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#endif + +#define BLOCKSIZE 4 + +#include "zgemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_8_rvv_max.c b/kernel/riscv64/zgemm_tcopy_8_rvv_max.c new file mode 100644 index 0000000000..53b28a5652 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_8_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#endif + +#define BLOCKSIZE 8 + +#include "zgemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_rvv_max_common.h b/kernel/riscv64/zgemm_tcopy_rvv_max_common.h new file mode 100644 index 0000000000..c0417a5e37 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_rvv_max_common.h @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define xstr(s) str(s) +#define str(s) #s + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + IFLOAT *aoffset = a; + IFLOAT *boffset = b; + IFLOAT *boffset2 = b + m * (n & ~(BLOCKSIZE - 1)) * 2; + + //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", xstr(CNAME), m, n, lda); + + for (BLASLONG j = m; j > 0; j--) { + IFLOAT *aoffset1 = aoffset; + IFLOAT *boffset1 = boffset; + + aoffset += lda * 2; + boffset += BLOCKSIZE * 2; + + for (BLASLONG i = n / BLOCKSIZE; i > 0; i--) { + size_t vl = BLOCKSIZE; + + FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v, vl); + v = VLEV_FLOAT(aoffset1 + vl, vl); + VSEV_FLOAT(boffset1 + vl, v, vl); + + aoffset1 += BLOCKSIZE * 2; + boffset1 += BLOCKSIZE * m * 2; + } + + if (n & (BLOCKSIZE - 1)) { + size_t vl = n & (BLOCKSIZE - 1); + + FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v, vl); + v = VLEV_FLOAT(aoffset1 + vl, vl); + VSEV_FLOAT(boffset2 + vl, v, vl); + + boffset2 += vl * 2; + } + } + + return 0; +} From 320e2d9331532e66ca890179a17a44e32670399a Mon Sep 17 00:00:00 2001 From: Kwok Cheung Yeung Date: Wed, 3 Dec 2025 20:12:21 +0000 Subject: [PATCH 3/6] Replace original RISC-V ZVL kernels with new kernels The original kernels have been deleted with this commit. --- kernel/riscv64/KERNEL.RISCV64_ZVL128B | 24 +- kernel/riscv64/KERNEL.RISCV64_ZVL256B | 25 +- kernel/riscv64/cgemm_kernel_8x4_zvl128b.c | 996 ---------- kernel/riscv64/cgemm_kernel_8x8_zvl256b.c | 1931 -------------------- kernel/riscv64/dgemm_kernel_8x4_zvl128b.c | 492 ----- kernel/riscv64/dgemm_kernel_8x8_zvl256b.c | 860 --------- kernel/riscv64/sgemm_kernel_16x8_zvl256b.c | 1081 ----------- kernel/riscv64/sgemm_kernel_8x8_zvl128b.c | 791 -------- kernel/riscv64/zgemm_kernel_4x4_zvl128b.c | 720 -------- kernel/riscv64/zgemm_kernel_8x4_zvl256b.c | 1253 ------------- 10 files changed, 25 insertions(+), 8148 deletions(-) delete mode 100644 kernel/riscv64/cgemm_kernel_8x4_zvl128b.c delete mode 100644 kernel/riscv64/cgemm_kernel_8x8_zvl256b.c delete mode 100644 kernel/riscv64/dgemm_kernel_8x4_zvl128b.c delete mode 100644 kernel/riscv64/dgemm_kernel_8x8_zvl256b.c delete mode 100644 kernel/riscv64/sgemm_kernel_16x8_zvl256b.c delete mode 100644 kernel/riscv64/sgemm_kernel_8x8_zvl128b.c delete mode 100644 kernel/riscv64/zgemm_kernel_4x4_zvl128b.c delete mode 100644 kernel/riscv64/zgemm_kernel_8x4_zvl256b.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B index ad7db5622e..2f8013eeed 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL128B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -95,10 +95,10 @@ DGEMVTKERNEL = gemv_t_rvv.c CGEMVTKERNEL = zgemv_t_rvv.c ZGEMVTKERNEL = zgemv_t_rvv.c -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +SGEMMKERNEL = sgemm_kernel_zvl128b.c ifneq ($(filter $(SGEMM_UNROLL_N),4 8 16),) SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c -SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv_max.c else SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c @@ -109,7 +109,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) ifneq ($(filter $(SGEMM_UNROLL_M),4 8 16),) SGEMMINCOPY = gemm_ncopy_$(SGEMM_UNROLL_M)_rvv.c -SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv.c +SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv_max.c else SGEMMINCOPY = 
../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c @@ -118,10 +118,10 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DGEMMKERNEL = dgemm_kernel_zvl128b.c DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c ifneq ($(filter $(DGEMM_UNROLL_N),4 8 16),) -DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv_max.c else DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif @@ -131,7 +131,7 @@ DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c ifneq ($(filter $(DGEMM_UNROLL_M),4 8 16),) -DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv.c +DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv_max.c else DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif @@ -139,28 +139,28 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CGEMMKERNEL = cgemm_kernel_zvl128b.c CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = zgemm_tcopy_$(CGEMM_UNROLL_N)_rvv_max.c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = zgemm_tcopy_$(CGEMM_UNROLL_M)_rvv_max.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZGEMMKERNEL = zgemm_kernel_zvl128b.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_N)_rvv_max.c ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_M)_rvv_max.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index c48095bb21..650169d3e5 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -95,10 +95,10 @@ DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +SGEMMKERNEL = sgemm_kernel_zvl256b.c ifneq ($(filter $(SGEMM_UNROLL_N),4 8 16),) SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c -SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv_max.c else SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c @@ -108,7 +108,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) ifneq ($(filter 
$(SGEMM_UNROLL_M),4 8 16),) SGEMMINCOPY = gemm_ncopy_$(SGEMM_UNROLL_M)_rvv.c -SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv.c +SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv_max.c else SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c @@ -117,19 +117,20 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +DGEMMKERNEL = dgemm_kernel_zvl256b.c DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c ifneq ($(filter $(DGEMM_UNROLL_N),4 8 16),) -DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv_max.c else DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif + DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c ifneq ($(filter $(DGEMM_UNROLL_M),4 8 16),) -DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv.c +DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv_max.c else DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif @@ -137,28 +138,28 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +CGEMMKERNEL = cgemm_kernel_zvl256b.c CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = zgemm_tcopy_$(CGEMM_UNROLL_N)_rvv_max.c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = zgemm_tcopy_$(CGEMM_UNROLL_M)_rvv_max.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c +ZGEMMKERNEL = zgemm_kernel_zvl256b.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_N)_rvv_max.c ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_M)_rvv_max.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif diff --git a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c deleted file mode 100644 index bd615389c8..0000000000 --- a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c +++ /dev/null @@ -1,996 +0,0 @@ -/* - -AUTOGENERATED KERNEL -Script: ./kernel/riscv64/generate_kernel.py -Settings: - LMUL=2 - M=8 - M_tail_scalar_from=2 - N=4 - __riscv_='__riscv_' - complex=True - conjugate=False - cpu='zvl128b' - force_acc_double=False - index_type='BLASLONG' - op='gemm' - param_precision='float' - reg_width_bits=128 - tail_policy='' - trace=False - -Derived: - ELEN_ACC=32 - ELEN_PARAM=32 - LMUL_ACC=2 - VFMACC='__riscv_vfmacc_vf_f32m2' - VFMUL='__riscv_vfmul_vf_f32m2' - VLEV='__riscv_vle32_v_f32m2' - VLSEV='__riscv_vlse32_v_f32m2' - 
VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' - VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' - VSETVL='__riscv_vsetvl_e32m2' - VSEV='__riscv_vse32_v_f32m2' - VSSEV='__riscv_vsse32_v_f32m2' - acc_vector_t='vfloat32m2_t' - output='cgemm_kernel_8x4_zvl128b.c' - param_scalar_t='float' - param_vector_t='vfloat32m2_t' - -*/ - -#include "common.h" - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define S0 1 -#define S1 -1 -#define S2 1 -#define S3 1 -#define VFMACC_RR __riscv_vfmsac -#define VFMACC_RI __riscv_vfmacc -#endif -#if defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define S0 1 -#define S1 1 -#define S2 1 -#define S3 -1 -#define VFMACC_RR __riscv_vfmacc -#define VFMACC_RI __riscv_vfmsac -#endif -#if defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define S0 1 -#define S1 1 -#define S2 -1 -#define S3 1 -#define VFMACC_RR __riscv_vfmacc -#define VFMACC_RI __riscv_vfnmsac -#endif -#if defined(RR) || defined(RC) || defined(CR) || defined(CC) -#define S0 1 -#define S1 -1 -#define S2 -1 -#define S3 -1 -#define VFMACC_RR __riscv_vfmsac -#define VFMACC_RI __riscv_vfnmacc -#endif - -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) - -{ - BLASLONG gvl = 0; - BLASLONG m_top = 0; - BLASLONG n_top = 0; - - // -- MAIN PASS - - for (BLASLONG j = 0; j < N / 4; j += 1) { - m_top = 0; - BLASLONG gvl = __riscv_vsetvl_e32m2(8); - - for (BLASLONG i = 0; i < M / 8; i += 1) { - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - float B1i = B[bi + 1 * 2 + 1]; - float B2r = B[bi + 2 * 2 + 0]; - float B2i = B[bi + 2 * 2 + 1]; - float B3r = B[bi + 3 * 2 + 0]; - float B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k - // leaving 6 vector registers for temporaries - // performing 2 operations between reuses of temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - vfloat32m2_t ACC2r = tmp0r; - vfloat32m2_t ACC2i = tmp0i; - vfloat32m2_t ACC3r = tmp1r; - vfloat32m2_t ACC3i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - B2r = B[bi + 2 * 2 + 0]; - B2i = B[bi + 2 * 2 + 1]; - B3r = B[bi + 3 * 2 + 0]; - B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * 
gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); - ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); - ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); - ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); - C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); - C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); - C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); - C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); - C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); - C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); - - m_top += 8; - } - - // -- tails for main pass - - if (M & 4) { - gvl = 
__riscv_vsetvl_e32m2(4); - - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - float B1i = B[bi + 1 * 2 + 1]; - float B2r = B[bi + 2 * 2 + 0]; - float B2i = B[bi + 2 * 2 + 1]; - float B3r = B[bi + 3 * 2 + 0]; - float B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k - // leaving 6 vector registers for temporaries - // performing 2 operations between reuses of temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - vfloat32m2_t ACC2r = tmp0r; - vfloat32m2_t ACC2i = tmp0i; - vfloat32m2_t ACC3r = tmp1r; - vfloat32m2_t ACC3i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - B2r = B[bi + 2 * 2 + 0]; - B2i = B[bi + 2 * 2 + 1]; - B3r = B[bi + 3 * 2 + 0]; - B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); - ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); - ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); - ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += 
ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); - C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); - C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); - C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); - C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); - C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); - C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); - - m_top += 4; - } - - if (M & 2) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - float result4 = 0; - float result5 = 0; - float result6 = 0; - float result7 = 0; - float result8 = 0; - float result9 = 0; - float result10 = 0; - float result11 = 0; - float result12 = 0; - float result13 = 0; - float result14 = 0; - float result15 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; - result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; - result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; - result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; - result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; - result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; - result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; - result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; - result12 
+= S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; - result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; - result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; - result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; - ai += 2 * 2; - bi += 4 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; - Cr += result4 * alphar; - Ci += result5 * alphar; - Cr -= result5 * alphai; - Ci += result4 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; - Cr += result6 * alphar; - Ci += result7 * alphar; - Cr -= result7 * alphai; - Ci += result6 * alphai; - C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; - Cr += result8 * alphar; - Ci += result9 * alphar; - Cr -= result9 * alphai; - Ci += result8 * alphai; - C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; - Cr += result10 * alphar; - Ci += result11 * alphar; - Cr -= result11 * alphai; - Ci += result10 * alphai; - C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; - Cr += result12 * alphar; - Ci += result13 * alphar; - Cr -= result13 * alphai; - Ci += result12 * alphai; - C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; - Cr += result14 * alphar; - Ci += result15 * alphar; - Cr -= result15 * alphai; - Ci += result14 * alphai; - C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; - m_top += 2; - } - - if (M & 1) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - float result4 = 0; - float result5 = 0; - float result6 = 0; - float result7 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; - result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; - result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; - result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; - ai += 1 * 2; - bi += 4 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - 
float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; - Cr += result4 * alphar; - Ci += result5 * alphar; - Cr -= result5 * alphai; - Ci += result4 * alphai; - C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; - Cr += result6 * alphar; - Ci += result7 * alphar; - Cr -= result7 * alphai; - Ci += result6 * alphai; - C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; - m_top += 1; - } - - n_top += 4; - } - - // -- tails for N=2 - - if (N & 2) { - gvl = __riscv_vsetvl_e32m2(8); - m_top = 0; - - for (BLASLONG i = 0; i < M / 8; i += 1) { - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - float B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k - // leaving 10 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = 
__riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - - m_top += 8; - } - - if (M & 4) { - gvl = __riscv_vsetvl_e32m2(4); - - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - float B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k - // leaving 10 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - 
C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - - m_top += 4; - } - - if (M & 2) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - float result4 = 0; - float result5 = 0; - float result6 = 0; - float result7 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; - result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; - result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; - result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; - ai += 2 * 2; - bi += 2 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; - Cr += result4 * alphar; - Ci += result5 * alphar; - Cr -= result5 * alphai; - Ci += result4 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; - Cr += result6 * alphar; - Ci += result7 * alphar; - Cr -= result7 * alphai; - Ci += result6 * alphai; - C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; - m_top += 2; - } - - if (M & 1) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - ai += 1 * 2; - bi += 2 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; - Cr += result2 * alphar; 
- Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - m_top += 1; - } - - n_top += 2; - } - - // -- tails for N=1 - - if (N & 1) { - gvl = __riscv_vsetvl_e32m2(8); - m_top = 0; - - for (BLASLONG i = 0; i < M / 8; i += 1) { - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k - // leaving 12 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - - m_top += 8; - } - - if (M & 4) { - gvl = __riscv_vsetvl_e32m2(4); - - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k - // leaving 12 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, 
tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - - m_top += 4; - } - - if (M & 2) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; - result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; - ai += 2 * 2; - bi += 1 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; - m_top += 2; - } - - if (M & 1) { - float result0 = 0; - float result1 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - ai += 1 * 2; - bi += 1 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - m_top += 1; - } - - n_top += 1; - } - - return 0; -} diff --git a/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c b/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c deleted file mode 100644 index 7980c029a4..0000000000 --- a/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c +++ /dev/null @@ -1,1931 +0,0 @@ -/* - -AUTOGENERATED KERNEL -Settings: - LMUL=1 - M=8 - M_tail_scalar_from=1 - N=8 - __riscv_='__riscv_' - complex=True - conjugate=False - cpu='zvl256b' - force_acc_double=False - index_type='BLASLONG' - op='gemm' - param_precision='float' - reg_width_bits=256 - tail_policy='' - trace=False - -Derived: - ELEN_ACC=32 - ELEN_PARAM=32 - LMUL_ACC=1 - VFMACC='__riscv_vfmacc_vf_f32m1' - VFMUL='__riscv_vfmul_vf_f32m1' - VLEV='__riscv_vle32_v_f32m1' - VLSEV='__riscv_vlse32_v_f32m1' - VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1' - VMUL_TO_ACC='__riscv_vfmul_vf_f32m1' - VSETVL='__riscv_vsetvl_e32m1' - VSEV='__riscv_vse32_v_f32m1' - VSSEV='__riscv_vsse32_v_f32m1' - acc_vector_t='vfloat32m1_t' - output='cgemm_kernel_8x8_zvl256b.c' - param_scalar_t='float' - 
param_vector_t='vfloat32m1_t'
-
-*/
-
-#include "common.h"
-
-
-
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- #define S0 1
- #define S1 -1
- #define S2 1
- #define S3 1
- #define VFMACC_RR __riscv_vfmsac
- #define VFMACC_RI __riscv_vfmacc
-#endif
-#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
- #define S0 1
- #define S1 1
- #define S2 1
- #define S3 -1
- #define VFMACC_RR __riscv_vfmacc
- #define VFMACC_RI __riscv_vfmsac
-#endif
-#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
- #define S0 1
- #define S1 1
- #define S2 -1
- #define S3 1
- #define VFMACC_RR __riscv_vfmacc
- #define VFMACC_RI __riscv_vfnmsac
-#endif
-#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
- #define S0 1
- #define S1 -1
- #define S2 -1
- #define S3 -1
- #define VFMACC_RR __riscv_vfmsac
- #define VFMACC_RI __riscv_vfnmacc
-#endif
-
-int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
-
-{
- BLASLONG gvl = 0;
- BLASLONG m_top = 0;
- BLASLONG n_top = 0;
-
-
- // -- MAIN PASS
-
- for (BLASLONG j=0; j<N/8; j+=1) {

[... remainder of deleted cgemm_kernel_8x8_zvl256b.c omitted ...]

From: Kwok Cheung Yeung
Date: Wed, 3 Dec 2025 21:02:48 +0000
Subject: [PATCH 4/6] Allow TRMM unroll sizes to be set independently of the
 GEMM unroll sizes for RISC-V

The unroll factors of the TRMM kernels are currently set to those of the
equivalent GEMM kernels. As we are not dealing with the TRMM kernels for
now, I have added extra xTRMM_UNROLL_(M|N) parameters to allow them to
be set independently of the xGEMM_UNROLL_(M|N) parameters. If the new
TRMM parameter is not defined, then it falls back to the original
behaviour of using the GEMM parameters.
---
 getarch_2nd.c                         | 48 +++++++++++++++++++++++++++
 kernel/riscv64/KERNEL.RISCV64_ZVL128B | 46 ++++++++++++-------------
 kernel/riscv64/KERNEL.RISCV64_ZVL256B |  8 ++---
 3 files changed, 75 insertions(+), 27 deletions(-)

diff --git a/getarch_2nd.c b/getarch_2nd.c
index 2085556bd6..744ad5ee7d 100644
--- a/getarch_2nd.c
+++ b/getarch_2nd.c
@@ -103,7 +103,55 @@ int main(int argc, char **argv) {
   printf("XGEMM3M_UNROLL_N=%d\n", QGEMM_DEFAULT_UNROLL_N);
 #endif
 
+#ifdef ARCH_RISCV64
+#ifdef STRMM_DEFAULT_UNROLL_M
+  printf("STRMM_UNROLL_M=%d\n", STRMM_DEFAULT_UNROLL_M);
+#else
+  printf("STRMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
+#endif
+
+#ifdef STRMM_DEFAULT_UNROLL_N
+  printf("STRMM_UNROLL_N=%d\n", STRMM_DEFAULT_UNROLL_N);
+#else
+  printf("STRMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
+#endif
+
+#ifdef DTRMM_DEFAULT_UNROLL_M
+  printf("DTRMM_UNROLL_M=%d\n", DTRMM_DEFAULT_UNROLL_M);
+#else
+  printf("DTRMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);
+#endif
+
+#ifdef DTRMM_DEFAULT_UNROLL_N
+  printf("DTRMM_UNROLL_N=%d\n", DTRMM_DEFAULT_UNROLL_N);
+#else
+  printf("DTRMM_UNROLL_N=%d\n", DGEMM_DEFAULT_UNROLL_N);
+#endif
+#ifdef CTRMM_DEFAULT_UNROLL_M
+  printf("CTRMM_UNROLL_M=%d\n", CTRMM_DEFAULT_UNROLL_M);
+#else
+  printf("CTRMM_UNROLL_M=%d\n", CGEMM_DEFAULT_UNROLL_M);
+#endif
+
+#ifdef CTRMM_DEFAULT_UNROLL_N
+  printf("CTRMM_UNROLL_N=%d\n", CTRMM_DEFAULT_UNROLL_N);
+#else
+  printf("CTRMM_UNROLL_N=%d\n", CGEMM_DEFAULT_UNROLL_N);
+#endif
+
+#ifdef ZTRMM_DEFAULT_UNROLL_M
+  printf("ZTRMM_UNROLL_M=%d\n", ZTRMM_DEFAULT_UNROLL_M);
+#else
+  printf("ZTRMM_UNROLL_M=%d\n", ZGEMM_DEFAULT_UNROLL_M);
+#endif
+
+#ifdef ZTRMM_DEFAULT_UNROLL_N
+  printf("ZTRMM_UNROLL_N=%d\n", ZTRMM_DEFAULT_UNROLL_N);
+#else
+  printf("ZTRMM_UNROLL_N=%d\n", ZGEMM_DEFAULT_UNROLL_N);
+#endif
+#endif /* ARCH_RISCV64 */
 }
 
diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B
index 
2f8013eeed..038d58e8f5 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL128B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -165,29 +165,29 @@ ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c -STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c -STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c -STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c -STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c - -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c -DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c -DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c -DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c -DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c - -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c -CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c -CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c -CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c -CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c - -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c -ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c -ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c -ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c -ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c +STRMMKERNEL = strmm_kernel_$(STRMM_UNROLL_M)x$(STRMM_UNROLL_N)_zvl128b.c +STRMMUNCOPY_M = ../generic/trmm_uncopy_$(STRMM_UNROLL_M).c +STRMMLNCOPY_M = ../generic/trmm_lncopy_$(STRMM_UNROLL_M).c +STRMMUTCOPY_M = ../generic/trmm_utcopy_$(STRMM_UNROLL_M).c +STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(STRMM_UNROLL_M).c + +DTRMMKERNEL = dtrmm_kernel_$(DTRMM_UNROLL_M)x$(DTRMM_UNROLL_N)_zvl128b.c +DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DTRMM_UNROLL_M).c +DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DTRMM_UNROLL_M).c +DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DTRMM_UNROLL_M).c +DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DTRMM_UNROLL_M).c + +CTRMMKERNEL = ctrmm_kernel_$(CTRMM_UNROLL_M)x$(CTRMM_UNROLL_N)_zvl128b.c +CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CTRMM_UNROLL_M).c +CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CTRMM_UNROLL_M).c +CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CTRMM_UNROLL_M).c +CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CTRMM_UNROLL_M).c + +ZTRMMKERNEL = ztrmm_kernel_$(ZTRMM_UNROLL_M)x$(ZTRMM_UNROLL_N)_zvl128b.c +ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZTRMM_UNROLL_M).c +ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZTRMM_UNROLL_M).c +ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZTRMM_UNROLL_M).c +ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZTRMM_UNROLL_M).c STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index 650169d3e5..b95f7ab3e3 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -90,10 +90,10 @@ DGEMVTKERNEL = gemv_t_vector.c CGEMVTKERNEL = zgemv_t_vector.c ZGEMVTKERNEL = zgemv_t_vector.c -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c +STRMMKERNEL = 
strmm_kernel_$(STRMM_UNROLL_M)x$(STRMM_UNROLL_N)_zvl256b.c
+DTRMMKERNEL = dtrmm_kernel_$(DTRMM_UNROLL_M)x$(DTRMM_UNROLL_N)_zvl256b.c
+CTRMMKERNEL = ctrmm_kernel_$(CTRMM_UNROLL_M)x$(CTRMM_UNROLL_N)_zvl256b.c
+ZTRMMKERNEL = ztrmm_kernel_$(ZTRMM_UNROLL_M)x$(ZTRMM_UNROLL_N)_zvl256b.c
 
 SGEMMKERNEL = sgemm_kernel_zvl256b.c
 ifneq ($(filter $(SGEMM_UNROLL_N),4 8 16),)

From 38d528c7fd6279847b3f0719be1cbdfa41f8cc0c Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung
Date: Wed, 3 Dec 2025 22:06:08 +0000
Subject: [PATCH 5/6] Change DGEMM tiling size from 8x8 to 4x8 for the RISC-V
 ZVL256B architecture

Testing has shown that 4x8 or 4x4 performs better than the original 8x8
tiling size for this kernel. As we do not wish to perturb the behaviour
of the DTRMM kernel at this point, the DTRMM tiling size is explicitly
set to the original 8x8.
---
 cmake/prebuild.cmake | 2 +-
 param.h              | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index 9320af9224..cfc8a893be 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -1475,7 +1475,7 @@ endif ()
       "#define L2_ASSOCIATIVE 4\n")
     set(SGEMM_UNROLL_M 16)
     set(SGEMM_UNROLL_N 8)
-    set(DGEMM_UNROLL_M 8)
+    set(DGEMM_UNROLL_M 4)
     set(DGEMM_UNROLL_N 8)
     set(CGEMM_UNROLL_M 8)
     set(CGEMM_UNROLL_N 8)
diff --git a/param.h b/param.h
index 8e598d8a01..8caa997c14 100644
--- a/param.h
+++ b/param.h
@@ -3231,7 +3231,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_UNROLL_M 16
 #define SGEMM_DEFAULT_UNROLL_N 8
 
-#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_M 4
 #define DGEMM_DEFAULT_UNROLL_N 8
 
 #define CGEMM_DEFAULT_UNROLL_M 8
@@ -3240,6 +3240,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_N 4
 
+#define DTRMM_DEFAULT_UNROLL_M 8
+#define DTRMM_DEFAULT_UNROLL_N 8
+
 #undef SHGEMM_DEFAULT_P
 #define SHGEMM_DEFAULT_P 128
 #undef SBGEMM_DEFAULT_P

From 9b30a9a8af43449b1d263aadfe491b5f837f94b7 Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung
Date: Fri, 5 Dec 2025 12:21:08 +0000
Subject: [PATCH 6/6] Make new RISC-V kernels work with DYNAMIC_ARCH enabled

If DYNAMIC_ARCH is enabled, then the various xGEMM_UNROLL_(M|N) macros
expand to 'gotoblas->', which prevents their use in macro conditionals,
as the value is only known at runtime. To get around this, we use the
xGEMM_DEFAULT_UNROLL_(M|N) macros instead, which expand to compile-time
constants.

As the tiling factors used in the tcopy/ncopy functions are also
selected at compile time, this does not reduce functionality: changing
the tiling at runtime without also changing tcopy/ncopy would produce
incorrect results anyway.
---
 kernel/riscv64/cgemm_kernel_zvl128b.c | 4 ++--
 kernel/riscv64/cgemm_kernel_zvl256b.c | 4 ++--
 kernel/riscv64/dgemm_kernel_zvl128b.c | 4 ++--
 kernel/riscv64/dgemm_kernel_zvl256b.c | 4 ++--
 kernel/riscv64/sgemm_kernel_zvl128b.c | 4 ++--
 kernel/riscv64/sgemm_kernel_zvl256b.c | 4 ++--
 kernel/riscv64/zgemm_kernel_zvl128b.c | 4 ++--
 kernel/riscv64/zgemm_kernel_zvl256b.c | 4 ++--
 8 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/kernel/riscv64/cgemm_kernel_zvl128b.c b/kernel/riscv64/cgemm_kernel_zvl128b.c
index c37d365775..6e9ac386d5 100644
--- a/kernel/riscv64/cgemm_kernel_zvl128b.c
+++ b/kernel/riscv64/cgemm_kernel_zvl128b.c
@@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" -#define M_BLOCKSIZE CGEMM_UNROLL_M -#define N_BLOCKSIZE CGEMM_UNROLL_N +#define M_BLOCKSIZE CGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE CGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 4 #define RVV_MUL __riscv_vfmul_vf_f32m1 diff --git a/kernel/riscv64/cgemm_kernel_zvl256b.c b/kernel/riscv64/cgemm_kernel_zvl256b.c index 8b991bfc4f..729b99c7e2 100644 --- a/kernel/riscv64/cgemm_kernel_zvl256b.c +++ b/kernel/riscv64/cgemm_kernel_zvl256b.c @@ -28,8 +28,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define M_BLOCKSIZE CGEMM_UNROLL_M -#define N_BLOCKSIZE CGEMM_UNROLL_N +#define M_BLOCKSIZE CGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE CGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 8 #define RVV_MUL __riscv_vfmul_vf_f32m1 diff --git a/kernel/riscv64/dgemm_kernel_zvl128b.c b/kernel/riscv64/dgemm_kernel_zvl128b.c index 45a9400063..d25f57588a 100644 --- a/kernel/riscv64/dgemm_kernel_zvl128b.c +++ b/kernel/riscv64/dgemm_kernel_zvl128b.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define M_BLOCKSIZE DGEMM_UNROLL_M -#define N_BLOCKSIZE DGEMM_UNROLL_N +#define M_BLOCKSIZE DGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE DGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 2 #define RVV_MUL __riscv_vfmul_vf_f64m1 diff --git a/kernel/riscv64/dgemm_kernel_zvl256b.c b/kernel/riscv64/dgemm_kernel_zvl256b.c index 84e66ab721..844c91e93e 100644 --- a/kernel/riscv64/dgemm_kernel_zvl256b.c +++ b/kernel/riscv64/dgemm_kernel_zvl256b.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define M_BLOCKSIZE DGEMM_UNROLL_M -#define N_BLOCKSIZE DGEMM_UNROLL_N +#define M_BLOCKSIZE DGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE DGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 4 #define RVV_MUL __riscv_vfmul_vf_f64m1 diff --git a/kernel/riscv64/sgemm_kernel_zvl128b.c b/kernel/riscv64/sgemm_kernel_zvl128b.c index 60e9d62ac0..cfd27662e9 100644 --- a/kernel/riscv64/sgemm_kernel_zvl128b.c +++ b/kernel/riscv64/sgemm_kernel_zvl128b.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define M_BLOCKSIZE SGEMM_UNROLL_M -#define N_BLOCKSIZE SGEMM_UNROLL_N +#define M_BLOCKSIZE SGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE SGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 4 #define RVV_MUL __riscv_vfmul_vf_f32m1 diff --git a/kernel/riscv64/sgemm_kernel_zvl256b.c b/kernel/riscv64/sgemm_kernel_zvl256b.c index 925e01d102..316adbbca5 100644 --- a/kernel/riscv64/sgemm_kernel_zvl256b.c +++ b/kernel/riscv64/sgemm_kernel_zvl256b.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define M_BLOCKSIZE SGEMM_UNROLL_M -#define N_BLOCKSIZE SGEMM_UNROLL_N +#define M_BLOCKSIZE SGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE SGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 8 #define RVV_MUL __riscv_vfmul_vf_f32m1 diff --git a/kernel/riscv64/zgemm_kernel_zvl128b.c b/kernel/riscv64/zgemm_kernel_zvl128b.c index ba8496e425..ad8773cc4b 100644 --- a/kernel/riscv64/zgemm_kernel_zvl128b.c +++ b/kernel/riscv64/zgemm_kernel_zvl128b.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#define M_BLOCKSIZE ZGEMM_UNROLL_M -#define N_BLOCKSIZE ZGEMM_UNROLL_N +#define M_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 2 #define RVV_MUL __riscv_vfmul_vf_f64m1 diff --git a/kernel/riscv64/zgemm_kernel_zvl256b.c b/kernel/riscv64/zgemm_kernel_zvl256b.c index ff32ba60a5..084bedb080 100644 --- a/kernel/riscv64/zgemm_kernel_zvl256b.c +++ b/kernel/riscv64/zgemm_kernel_zvl256b.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define M_BLOCKSIZE ZGEMM_UNROLL_M -#define N_BLOCKSIZE ZGEMM_UNROLL_N +#define M_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 4 #define RVV_MUL __riscv_vfmul_vf_f64m1