From 744eda853c1e3b9d3ff5b2e0dee49fe55c624dae Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung
Date: Wed, 3 Dec 2025 15:44:23 +0000
Subject: [PATCH 1/6] Add new SGEMM/DGEMM kernels for RISC-V ZVL128B/ZVL256B
 architectures

These kernels use the same algorithm as the original ones, but improve
on them in several respects:

- Common code is shared between the two kernel types and architectures,
  reducing the overall line count.

- Instead of using auto-generated code with each vector operation
  repeated multiple times (necessitated in part by the vector types not
  being storable in arrays), macros and forced inlining are used to
  achieve the same effect directly, giving better clarity and a major
  reduction in lines of code.

- The tiling sizes can be modified within a certain range by changing
  the unroll parameters.

- The RVV extension does not require the number of elements processed
  by a vector operation to be a power of two, so the tails of the
  matrix that involve vector operations can be handled in a single
  operation rather than in decreasing powers of two, thereby improving
  performance. The tcopy operations are modified to take this into
  account.
---
 kernel/riscv64/dgemm_kernel_zvl128b.c       |  55 ++++++
 kernel/riscv64/dgemm_kernel_zvl256b.c       |  55 ++++++
 kernel/riscv64/gemm_kernel_rvv_vlv_common.h | 179 ++++++++++++++++++++
 kernel/riscv64/gemm_tcopy_16_rvv_max.c      |  42 +++++
 kernel/riscv64/gemm_tcopy_4_rvv_max.c       |  42 +++++
 kernel/riscv64/gemm_tcopy_8_rvv_max.c       |  42 +++++
 kernel/riscv64/gemm_tcopy_rvv_max_common.h  |  67 ++++++++
 kernel/riscv64/sgemm_kernel_zvl128b.c       |  55 ++++++
 kernel/riscv64/sgemm_kernel_zvl256b.c       |  49 ++++++
 9 files changed, 586 insertions(+)
 create mode 100644 kernel/riscv64/dgemm_kernel_zvl128b.c
 create mode 100644 kernel/riscv64/dgemm_kernel_zvl256b.c
 create mode 100644 kernel/riscv64/gemm_kernel_rvv_vlv_common.h
 create mode 100644 kernel/riscv64/gemm_tcopy_16_rvv_max.c
 create mode 100644 kernel/riscv64/gemm_tcopy_4_rvv_max.c
 create mode 100644 kernel/riscv64/gemm_tcopy_8_rvv_max.c
 create mode 100644 kernel/riscv64/gemm_tcopy_rvv_max_common.h
 create mode 100644 kernel/riscv64/sgemm_kernel_zvl128b.c
 create mode 100644 kernel/riscv64/sgemm_kernel_zvl256b.c
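As an illustrative sketch of the last point (hypothetical code, not part of the patch; variable names are invented): the RVV intrinsics take the active vector length as a runtime argument, so a tail of, say, three elements can be processed by a single sized operation instead of a two-element step followed by a one-element step:

    /* Hypothetical single-precision tail: m_size may be any value in 1..M_BLOCKSIZE. */
    size_t vl = m_size;                              /* e.g. 3, not a power of two   */
    vfloat32m2_t va = __riscv_vle32_v_f32m2(a, vl);  /* loads only vl elements       */
    vc = __riscv_vfmacc_vf_f32m2(vc, b0, va, vl);    /* updates only the first vl lanes */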
diff --git a/kernel/riscv64/dgemm_kernel_zvl128b.c b/kernel/riscv64/dgemm_kernel_zvl128b.c
new file mode 100644
index 0000000000..45a9400063
--- /dev/null
+++ b/kernel/riscv64/dgemm_kernel_zvl128b.c
@@ -0,0 +1,55 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#define M_BLOCKSIZE DGEMM_UNROLL_M
+#define N_BLOCKSIZE DGEMM_UNROLL_N
+
+#if M_BLOCKSIZE == 2
+#define RVV_MUL __riscv_vfmul_vf_f64m1
+#define RVV_MACC __riscv_vfmacc_vf_f64m1
+#define RVV_LOAD __riscv_vle64_v_f64m1
+#define RVV_STORE __riscv_vse64_v_f64m1
+#define VECTOR_T vfloat64m1_t
+#elif M_BLOCKSIZE == 4
+#define RVV_MUL __riscv_vfmul_vf_f64m2
+#define RVV_MACC __riscv_vfmacc_vf_f64m2
+#define RVV_LOAD __riscv_vle64_v_f64m2
+#define RVV_STORE __riscv_vse64_v_f64m2
+#define VECTOR_T vfloat64m2_t
+#elif M_BLOCKSIZE == 8
+#define RVV_MUL __riscv_vfmul_vf_f64m4
+#define RVV_MACC __riscv_vfmacc_vf_f64m4
+#define RVV_LOAD __riscv_vle64_v_f64m4
+#define RVV_STORE __riscv_vse64_v_f64m4
+#define VECTOR_T vfloat64m4_t
+#else
+#error "Unsupported M_BLOCKSIZE value"
+#endif
+
+#include "gemm_kernel_rvv_vlv_common.h"
diff --git a/kernel/riscv64/dgemm_kernel_zvl256b.c b/kernel/riscv64/dgemm_kernel_zvl256b.c
new file mode 100644
index 0000000000..84e66ab721
--- /dev/null
+++ b/kernel/riscv64/dgemm_kernel_zvl256b.c
@@ -0,0 +1,55 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#include "common.h" + +#define M_BLOCKSIZE DGEMM_UNROLL_M +#define N_BLOCKSIZE DGEMM_UNROLL_N + +#if M_BLOCKSIZE == 4 +#define RVV_MUL __riscv_vfmul_vf_f64m1 +#define RVV_MACC __riscv_vfmacc_vf_f64m1 +#define RVV_LOAD __riscv_vle64_v_f64m1 +#define RVV_STORE __riscv_vse64_v_f64m1 +#define VECTOR_T vfloat64m1_t +#elif M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f64m2 +#define RVV_MACC __riscv_vfmacc_vf_f64m2 +#define RVV_LOAD __riscv_vle64_v_f64m2 +#define RVV_STORE __riscv_vse64_v_f64m2 +#define VECTOR_T vfloat64m2_t +#elif M_BLOCKSIZE == 16 +#define RVV_MUL __riscv_vfmul_vf_f64m4 +#define RVV_MACC __riscv_vfmacc_vf_f64m4 +#define RVV_LOAD __riscv_vle64_v_f64m4 +#define RVV_STORE __riscv_vse64_v_f64m4 +#define VECTOR_T vfloat64m4_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "gemm_kernel_rvv_vlv_common.h" diff --git a/kernel/riscv64/gemm_kernel_rvv_vlv_common.h b/kernel/riscv64/gemm_kernel_rvv_vlv_common.h new file mode 100644 index 0000000000..31c2f0fdc6 --- /dev/null +++ b/kernel/riscv64/gemm_kernel_rvv_vlv_common.h @@ -0,0 +1,179 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define RISCV_REPEAT(INSN, BLEN, ...) 
\
+do { \
+  INSN (0, __VA_ARGS__); \
+  if (BLEN == 1) break; \
+  INSN (1, __VA_ARGS__); \
+  if (BLEN == 2) break; \
+  INSN (2, __VA_ARGS__); \
+  INSN (3, __VA_ARGS__); \
+  if (BLEN == 4) break; \
+  INSN (4, __VA_ARGS__); \
+  INSN (5, __VA_ARGS__); \
+  INSN (6, __VA_ARGS__); \
+  INSN (7, __VA_ARGS__); \
+} while (0)
+
+#define RISCV_MUL(N, DEST, A, B, LEN) \
+  DEST##N = RVV_MUL(A, B[N], LEN);
+
+#define RISCV_ACC_MUL(N, DEST, A, B, LEN) \
+  DEST##N = RVV_MACC(DEST##N, B[N], A, LEN);
+
+#define RISCV_ACC_MUL_CONST(N, DEST, A, B, LEN) \
+  DEST##N = RVV_MACC(DEST##N, B, A##N, LEN);
+
+#define RISCV_LOAD(N, DEST, SRC, OFFSET, LDC, LEN) \
+  DEST##N = RVV_LOAD((SRC) + (OFFSET) + N*(LDC), LEN);
+
+#define RISCV_STORE(N, DEST, SRC, OFFSET, LDC, LEN) \
+  RVV_STORE((DEST) + (OFFSET) + N*(LDC), SRC##N, LEN);
+
+#define RISCV_LOAD_COLUMN(DEST, SRC, OFFSET, LEN) \
+  DEST = RVV_LOAD((SRC) + (OFFSET), LEN);
+
+#define COPY_ROW(DEST, SRC, OFFSET, LEN) \
+do { \
+  DEST[0] = SRC[OFFSET]; \
+  if (LEN == 1) break; \
+  DEST[1] = SRC[OFFSET + 1]; \
+  if (LEN == 2) break; \
+  DEST[2] = SRC[OFFSET + 2]; \
+  DEST[3] = SRC[OFFSET + 3]; \
+  if (LEN == 4) break; \
+  DEST[4] = SRC[OFFSET + 4]; \
+  DEST[5] = SRC[OFFSET + 5]; \
+  DEST[6] = SRC[OFFSET + 6]; \
+  DEST[7] = SRC[OFFSET + 7]; \
+} while (0)
+
+/* Perform matrix multiplication between submatrices:
+     A(m_size,K) * B(K,n_size) = C(m_size,n_size) */
+
+static inline __attribute__((always_inline))
+BLASLONG kernel (BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha,
+                 FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc,
+                 BLASLONG m_top, BLASLONG n_top,
+                 BLASLONG m_size, BLASLONG n_size)
+{
+  BLASLONG ai = m_top*K;
+  BLASLONG bi = n_top*K;
+
+  /* b[0..n_size-1] = B(0, n_top)..B(0, n_top+n_size-1) */
+  FLOAT b[N_BLOCKSIZE];
+  COPY_ROW (b, B, bi, n_size);
+  bi += n_size;
+
+  /* a[0..m_size-1] = A(m_top, 0)..A(m_top+m_size-1, 0) */
+  VECTOR_T a;
+  RISCV_LOAD_COLUMN (a, A, ai, m_size);
+  ai += m_size;
+
+  /* for I = 0..n_size-1
+       resultI[0..m_size-1] = A(m_top..m_top+m_size-1, 0) * B(0, n_top+I) */
+  VECTOR_T result0, result1, result2, result3;
+  VECTOR_T result4, result5, result6, result7;
+  RISCV_REPEAT (RISCV_MUL, n_size, result, a, b, m_size);
+
+  for (BLASLONG k = 1; k < K; k++) {
+    /* b[0..n_size-1] = B(k, n_top)..B(k, n_top+n_size-1) */
+    COPY_ROW (b, B, bi, n_size);
+    bi += n_size;
+
+    /* a[0..m_size-1] = A(m_top, k)..A(m_top+m_size-1, k) */
+    RISCV_LOAD_COLUMN (a, A, ai, m_size);
+    ai += m_size;
+
+    /* for I = 0..n_size-1
+         resultI[0..m_size-1] += A(m_top..m_top+m_size-1, k) * B(k, n_top+I) */
+    RISCV_REPEAT (RISCV_ACC_MUL, n_size, result, a, b, m_size);
+  }
+
+  BLASLONG ci = n_top * ldc + m_top;
+  VECTOR_T c0, c1, c2, c3, c4, c5, c6, c7;
+
+  /* for I = 0..n_size-1
+       cI[0..m_size-1] = C(m_top..m_top+m_size-1, n_top+I)
+       cI[0..m_size-1] += alpha * resultI[0..m_size-1]
+       C(m_top..m_top+m_size-1, n_top+I) = cI[0..m_size-1] */
+  RISCV_REPEAT (RISCV_LOAD, n_size, c, C, ci, ldc, m_size);
+  RISCV_REPEAT (RISCV_ACC_MUL_CONST, n_size, c, result, alpha, m_size);
+  RISCV_REPEAT (RISCV_STORE, n_size, C, c, ci, ldc, m_size);
+
+  return m_top + m_size;
+}
+
+/* Perform matrix multiplication between submatrices:
+     A(M,K) * B(K,n_size) = C(M, n_size) */
+
+static inline __attribute__((always_inline))
+BLASLONG kernel_column (BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha,
+                        FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc,
+                        BLASLONG n_top, BLASLONG n_size)
+{
+  BLASLONG m_top = 0;
+
+  for (BLASLONG i = 0; i < M / M_BLOCKSIZE; i++)
+    m_top = kernel (M, N, K, alpha, A, B, C, ldc, m_top, n_top,
+                    M_BLOCKSIZE, n_size);
+
+  if (M & (M_BLOCKSIZE - 1))
+    kernel (M, N, K, alpha, A, B, C, ldc, m_top, n_top, M - m_top, n_size);
+
+  return n_top + n_size;
+}
+
+#define xstr(s) str(s)
+#define str(s) #s
+
+/* Perform matrix multiplication between matrices:
+     A(M,K) * B(K,N) = C(M,N) */
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
+{
+  //fprintf(stderr, "%s (with VLV): M=%ld, N=%ld, K=%ld, ldc=%ld, m_blocksize=%d, n_blocksize=%d\n", xstr(CNAME), M, N, K, ldc, M_BLOCKSIZE, N_BLOCKSIZE);
+  BLASLONG n_top = 0;
+
+  for (BLASLONG j = 0; j < N / N_BLOCKSIZE; j++)
+    n_top = kernel_column (M, N, K, alpha, A, B, C, ldc, n_top, N_BLOCKSIZE);
+
+#if N_BLOCKSIZE > 4
+  if (N & 4)
+    n_top = kernel_column (M, N, K, alpha, A, B, C, ldc, n_top, 4);
+#endif
+#if N_BLOCKSIZE > 2
+  if (N & 2)
+    n_top = kernel_column (M, N, K, alpha, A, B, C, ldc, n_top, 2);
+#endif
+#if N_BLOCKSIZE > 1
+  if (N & 1)
+    kernel_column (M, N, K, alpha, A, B, C, ldc, n_top, 1);
+#endif
+
+  return 0;
+}
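For reference, a hypothetical expansion (not part of the patch): because kernel() is force-inlined and n_size is a compile-time constant at every call site, the `if (BLEN == ...) break;` guards in RISCV_REPEAT fold away under constant propagation. With n_size == 2, the line

    RISCV_REPEAT (RISCV_MUL, n_size, result, a, b, m_size);

reduces to just

    result0 = RVV_MUL(a, b[0], m_size);
    result1 = RVV_MUL(a, b[1], m_size);

which is the effect the auto-generated kernels previously achieved by spelling out every repetition by hand.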
diff --git a/kernel/riscv64/gemm_tcopy_16_rvv_max.c b/kernel/riscv64/gemm_tcopy_16_rvv_max.c
new file mode 100644
index 0000000000..85975e28be
--- /dev/null
+++ b/kernel/riscv64/gemm_tcopy_16_rvv_max.c
@@ -0,0 +1,42 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define FLOAT_V_T vfloat32m4_t
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VSEV_FLOAT __riscv_vse32_v_f32m4
+#else
+#define FLOAT_V_T vfloat64m8_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m8
+#define VSEV_FLOAT __riscv_vse64_v_f64m8
+#endif
+
+#define BLOCKSIZE 16
+
+#include "gemm_tcopy_rvv_max_common.h"
diff --git a/kernel/riscv64/gemm_tcopy_4_rvv_max.c b/kernel/riscv64/gemm_tcopy_4_rvv_max.c
new file mode 100644
index 0000000000..15af02bd86
--- /dev/null
+++ b/kernel/riscv64/gemm_tcopy_4_rvv_max.c
@@ -0,0 +1,42 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define FLOAT_V_T vfloat32m1_t
+#define VLEV_FLOAT __riscv_vle32_v_f32m1
+#define VSEV_FLOAT __riscv_vse32_v_f32m1
+#else
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m2
+#define VSEV_FLOAT __riscv_vse64_v_f64m2
+#endif
+
+#define BLOCKSIZE 4
+
+#include "gemm_tcopy_rvv_max_common.h"
diff --git a/kernel/riscv64/gemm_tcopy_8_rvv_max.c b/kernel/riscv64/gemm_tcopy_8_rvv_max.c
new file mode 100644
index 0000000000..0aa0358369
--- /dev/null
+++ b/kernel/riscv64/gemm_tcopy_8_rvv_max.c
@@ -0,0 +1,42 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#endif + +#define BLOCKSIZE 8 + +#include "gemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/gemm_tcopy_rvv_max_common.h b/kernel/riscv64/gemm_tcopy_rvv_max_common.h new file mode 100644 index 0000000000..425650cb35 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_rvv_max_common.h @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#define xstr(s) str(s)
+#define str(s) #s
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
+{
+  IFLOAT *aoffset = a;
+  IFLOAT *boffset = b;
+  IFLOAT *boffset2 = b + m * (n & ~(BLOCKSIZE - 1));
+
+  //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", xstr(CNAME), m, n, lda);
+
+  for(BLASLONG j = m; j > 0; j--) {
+    IFLOAT *aoffset1 = aoffset;
+    IFLOAT *boffset1 = boffset;
+
+    aoffset += lda;
+    boffset += BLOCKSIZE;
+
+    for(BLASLONG i = n / BLOCKSIZE; i > 0; i--) {
+      size_t vl = BLOCKSIZE;
+
+      FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl);
+      VSEV_FLOAT(boffset1, v, vl);
+
+      aoffset1 += BLOCKSIZE;
+      boffset1 += BLOCKSIZE * m;
+    }
+
+    if (n & (BLOCKSIZE - 1)) {
+      size_t vl = n & (BLOCKSIZE - 1);
+
+      FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl);
+      VSEV_FLOAT(boffset2, v, vl);
+
+      boffset2 += vl;
+    }
+  }
+
+  return 0;
+}
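As a worked example of the modified tail layout above (illustrative numbers, not from the patch): with m = 2, n = 10 and BLOCKSIZE = 8, the routine writes one full 8-wide panel per row, at b[0..7] (row 0) and b[8..15] (row 1), then packs the two 2-element row tails contiguously starting at boffset2 = b + m * (n & ~(BLOCKSIZE-1)) = b + 16, i.e. at b[16..17] and b[18..19]. Each tail is copied by a single vector operation with vl = n % BLOCKSIZE = 2, instead of the separate power-of-two sub-copies the earlier tcopy kernels required.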
diff --git a/kernel/riscv64/sgemm_kernel_zvl128b.c b/kernel/riscv64/sgemm_kernel_zvl128b.c
new file mode 100644
index 0000000000..60e9d62ac0
--- /dev/null
+++ b/kernel/riscv64/sgemm_kernel_zvl128b.c
@@ -0,0 +1,55 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#define M_BLOCKSIZE SGEMM_UNROLL_M
+#define N_BLOCKSIZE SGEMM_UNROLL_N
+
+#if M_BLOCKSIZE == 4
+#define RVV_MUL __riscv_vfmul_vf_f32m1
+#define RVV_MACC __riscv_vfmacc_vf_f32m1
+#define RVV_LOAD __riscv_vle32_v_f32m1
+#define RVV_STORE __riscv_vse32_v_f32m1
+#define VECTOR_T vfloat32m1_t
+#elif M_BLOCKSIZE == 8
+#define RVV_MUL __riscv_vfmul_vf_f32m2
+#define RVV_MACC __riscv_vfmacc_vf_f32m2
+#define RVV_LOAD __riscv_vle32_v_f32m2
+#define RVV_STORE __riscv_vse32_v_f32m2
+#define VECTOR_T vfloat32m2_t
+#elif M_BLOCKSIZE == 16
+#define RVV_MUL __riscv_vfmul_vf_f32m4
+#define RVV_MACC __riscv_vfmacc_vf_f32m4
+#define RVV_LOAD __riscv_vle32_v_f32m4
+#define RVV_STORE __riscv_vse32_v_f32m4
+#define VECTOR_T vfloat32m4_t
+#else
+#error "Unsupported M_BLOCKSIZE value"
+#endif
+
+#include "gemm_kernel_rvv_vlv_common.h"
diff --git a/kernel/riscv64/sgemm_kernel_zvl256b.c b/kernel/riscv64/sgemm_kernel_zvl256b.c
new file mode 100644
index 0000000000..925e01d102
--- /dev/null
+++ b/kernel/riscv64/sgemm_kernel_zvl256b.c
@@ -0,0 +1,49 @@
+/***************************************************************************
+Copyright (c) 2025, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#include "common.h" + +#define M_BLOCKSIZE SGEMM_UNROLL_M +#define N_BLOCKSIZE SGEMM_UNROLL_N + +#if M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f32m1 +#define RVV_MACC __riscv_vfmacc_vf_f32m1 +#define RVV_LOAD __riscv_vle32_v_f32m1 +#define RVV_STORE __riscv_vse32_v_f32m1 +#define VECTOR_T vfloat32m1_t +#elif M_BLOCKSIZE == 16 +#define RVV_MUL __riscv_vfmul_vf_f32m2 +#define RVV_MACC __riscv_vfmacc_vf_f32m2 +#define RVV_LOAD __riscv_vle32_v_f32m2 +#define RVV_STORE __riscv_vse32_v_f32m2 +#define VECTOR_T vfloat32m2_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "gemm_kernel_rvv_vlv_common.h" From 65108acf546f179498cf18330e7e46793c691eaf Mon Sep 17 00:00:00 2001 From: Kwok Cheung Yeung Date: Wed, 3 Dec 2025 15:48:59 +0000 Subject: [PATCH 2/6] Add improved CGEMM and ZGEMM kernels for the RISC-V ZVL128B and ZVL256B architectures This applies the changes made to the SGEMM/DGEMM kernels to their complex CGEMM/ZGEMM equivalents. --- kernel/riscv64/cgemm_kernel_zvl128b.c | 52 ++++ kernel/riscv64/cgemm_kernel_zvl256b.c | 48 ++++ kernel/riscv64/zgemm_kernel_rvv_vlv_common.h | 268 +++++++++++++++++++ kernel/riscv64/zgemm_kernel_zvl128b.c | 52 ++++ kernel/riscv64/zgemm_kernel_zvl256b.c | 52 ++++ kernel/riscv64/zgemm_tcopy_16_rvv_max.c | 42 +++ kernel/riscv64/zgemm_tcopy_4_rvv_max.c | 42 +++ kernel/riscv64/zgemm_tcopy_8_rvv_max.c | 42 +++ kernel/riscv64/zgemm_tcopy_rvv_max_common.h | 71 +++++ 9 files changed, 669 insertions(+) create mode 100644 kernel/riscv64/cgemm_kernel_zvl128b.c create mode 100644 kernel/riscv64/cgemm_kernel_zvl256b.c create mode 100644 kernel/riscv64/zgemm_kernel_rvv_vlv_common.h create mode 100644 kernel/riscv64/zgemm_kernel_zvl128b.c create mode 100644 kernel/riscv64/zgemm_kernel_zvl256b.c create mode 100644 kernel/riscv64/zgemm_tcopy_16_rvv_max.c create mode 100644 kernel/riscv64/zgemm_tcopy_4_rvv_max.c create mode 100644 kernel/riscv64/zgemm_tcopy_8_rvv_max.c create mode 100644 kernel/riscv64/zgemm_tcopy_rvv_max_common.h diff --git a/kernel/riscv64/cgemm_kernel_zvl128b.c b/kernel/riscv64/cgemm_kernel_zvl128b.c new file mode 100644 index 0000000000..c37d365775 --- /dev/null +++ b/kernel/riscv64/cgemm_kernel_zvl128b.c @@ -0,0 +1,52 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define M_BLOCKSIZE CGEMM_UNROLL_M +#define N_BLOCKSIZE CGEMM_UNROLL_N + +#if M_BLOCKSIZE == 4 +#define RVV_MUL __riscv_vfmul_vf_f32m1 +#define RVV_LOAD __riscv_vlse32_v_f32m1 +#define RVV_STORE __riscv_vsse32_v_f32m1 +#define VECTOR_T vfloat32m1_t +#elif M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f32m2 +#define RVV_LOAD __riscv_vlse32_v_f32m2 +#define RVV_STORE __riscv_vsse32_v_f32m2 +#define VECTOR_T vfloat32m2_t +#elif M_BLOCKSIZE == 16 +#define RVV_MUL __riscv_vfmul_vf_f32m4 +#define RVV_LOAD __riscv_vlse32_v_f32m4 +#define RVV_STORE __riscv_vsse32_v_f32m4 +#define VECTOR_T vfloat32m4_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "zgemm_kernel_rvv_vlv_common.h" diff --git a/kernel/riscv64/cgemm_kernel_zvl256b.c b/kernel/riscv64/cgemm_kernel_zvl256b.c new file mode 100644 index 0000000000..8b991bfc4f --- /dev/null +++ b/kernel/riscv64/cgemm_kernel_zvl256b.c @@ -0,0 +1,48 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + + +#define M_BLOCKSIZE CGEMM_UNROLL_M +#define N_BLOCKSIZE CGEMM_UNROLL_N + +#if M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f32m1 +#define RVV_LOAD __riscv_vlse32_v_f32m1 +#define RVV_STORE __riscv_vsse32_v_f32m1 +#define VECTOR_T vfloat32m1_t +#elif M_BLOCKSIZE == 16 +#define RVV_MUL __riscv_vfmul_vf_f32m2 +#define RVV_LOAD __riscv_vlse32_v_f32m2 +#define RVV_STORE __riscv_vsse32_v_f32m2 +#define VECTOR_T vfloat32m2_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "zgemm_kernel_rvv_vlv_common.h" diff --git a/kernel/riscv64/zgemm_kernel_rvv_vlv_common.h b/kernel/riscv64/zgemm_kernel_rvv_vlv_common.h new file mode 100644 index 0000000000..02f443f711 --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_rvv_vlv_common.h @@ -0,0 +1,268 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define S0 1 + #define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfnmacc +#endif + +#define RISCV_REPEAT_1(INSN, BLEN, ...) 
\ +do { \ + INSN (0, 0, __VA_ARGS__); \ + if (BLEN == 1) break; \ + INSN (1, 1, __VA_ARGS__); \ + if (BLEN == 2) break; \ + INSN (2, 2, __VA_ARGS__); \ + INSN (3, 3, __VA_ARGS__); \ +} while (0) + +#define RISCV_REPEAT_2(INSN, BLEN, ...) \ +do { \ + if (BLEN <= 4) break; \ + INSN (0, 4, __VA_ARGS__); \ + INSN (1, 5, __VA_ARGS__); \ + INSN (2, 6, __VA_ARGS__); \ + INSN (3, 7, __VA_ARGS__); \ +} while (0) + +#define RISCV_MUL(M, N, DESTr, DESTi, Ar, Ai, Bi, GVL) \ + DESTr##M = RVV_MUL(Ai, Bi[N], GVL); \ + DESTi##M = RVV_MUL(Ar, Bi[N], GVL); + +#define RISCV_VFMACC(M, N, DESTr, DESTi, Ar, Ai, Br, GVL) \ + DESTr##M = VFMACC_RR(DESTr##M, Br[N], Ar, GVL); \ + DESTi##M = VFMACC_RI(DESTi##M, Br[N], Ai, GVL); + +#define RISCV_ACC_MUL_CONSTR(M, N, DESTr, DESTi, Ar, Ai, B, GVL) \ + DESTr##N = __riscv_vfmacc(DESTr##N, B, Ar##N, GVL); \ + DESTi##N = __riscv_vfmacc(DESTi##N, B, Ai##N, GVL); + +#define RISCV_ACC_MUL_CONSTI(M, N, DESTr, DESTi, Ar, Ai, B, GVL) \ + DESTr##N = __riscv_vfnmsac(DESTr##N, B, Ai##N, GVL); \ + DESTi##N = __riscv_vfmacc(DESTi##N, B, Ar##N, GVL); + +#define RISCV_LOAD(M, N, DESTr, DESTi, SRC, OFFSET, LDC, GVL) \ + DESTr##N = RVV_LOAD((SRC) + ((OFFSET) + N*(LDC)) * 2, sizeof(FLOAT)*2, GVL); \ + DESTi##N = RVV_LOAD((SRC) + ((OFFSET) + N*(LDC)) * 2 + 1, sizeof(FLOAT)*2, GVL); + +#define RISCV_STORE(M, N, DEST, SRCr, SRCi, OFFSET, LDC, GVL) \ + RVV_STORE((DEST) + ((OFFSET) + N*(LDC)) * 2, sizeof(FLOAT)*2, SRCr##N, GVL); \ + RVV_STORE((DEST) + ((OFFSET) + N*(LDC)) * 2 + 1, sizeof(FLOAT)*2, SRCi##N, GVL); + +#define RISCV_LOAD_COLUMN(DESTR, DESTI, SRC, OFFSET, GVL) \ + DESTR = RVV_LOAD((SRC) + (OFFSET), sizeof (FLOAT)*2, GVL); \ + DESTI = RVV_LOAD((SRC) + (OFFSET) + 1, sizeof (FLOAT)*2, GVL); + +#define COPY_TMP(M, N, DESTr, DESTi, SRCr, SRCi) \ + DESTr##N = SRCr##M; \ + DESTi##N = SRCi##M; + +#define RISCV_ADD(M, N, DESTr, DESTi, SRCr, SRCi, GVL) \ + DESTr##N = __riscv_vfadd(DESTr##N, SRCr##M, GVL); \ + DESTi##N = __riscv_vfadd(DESTi##N, SRCi##M, GVL); + +#define COPY_ROW(DESTR, DESTI, SRC, OFFSET, LEN) \ +do { \ + DESTR[0] = SRC[OFFSET]; \ + DESTI[0] = SRC[OFFSET + 1]; \ + if (LEN == 1) break; \ + DESTR[1] = SRC[OFFSET + 2]; \ + DESTI[1] = SRC[OFFSET + 3]; \ + if (LEN == 2) break; \ + DESTR[2] = SRC[OFFSET + 4]; \ + DESTI[2] = SRC[OFFSET + 5]; \ + DESTR[3] = SRC[OFFSET + 6]; \ + DESTI[3] = SRC[OFFSET + 7]; \ + if (LEN == 4) break; \ + DESTR[4] = SRC[OFFSET + 8]; \ + DESTI[4] = SRC[OFFSET + 9]; \ + DESTR[5] = SRC[OFFSET + 10]; \ + DESTI[5] = SRC[OFFSET + 11]; \ + DESTR[6] = SRC[OFFSET + 12]; \ + DESTI[6] = SRC[OFFSET + 13]; \ + DESTR[7] = SRC[OFFSET + 14]; \ + DESTI[7] = SRC[OFFSET + 15]; \ +} while (0) + +/* Perform matrix multiplication between submatrices: + A(m_size,K) * B(K,n_size) = C(m_size,n_size) */ + +static inline __attribute__((always_inline)) +BLASLONG kernel (BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, + FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, + BLASLONG m_top, BLASLONG n_top, + BLASLONG m_size, BLASLONG n_size) +{ + BLASLONG ai = m_top*K*2; + BLASLONG bi = n_top*K*2; + + /* b_r[0..n_size-1] = real(B(0, n_top)..B(0, n_top+n_size-1)) + b_i[0..n_size-1] = imag(B(0, n_top)..B(0, n_top+n_size-1)) */ + FLOAT b_r[N_BLOCKSIZE], b_i[N_BLOCKSIZE]; + COPY_ROW (b_r, b_i, B, bi, n_size); + bi += n_size * 2; + + /* a_r[0..m_size-1] = real(A(m_top, 0)..A(m_top+m_size-1, 0)) + a_i[0..m_size-1] = imag(A(m_top, 0)..A(m_top+m_size-1, 0)) */ + VECTOR_T a_r, a_i; + RISCV_LOAD_COLUMN (a_r, a_i, A, ai, m_size); + ai += m_size * 2; + + /* for I = 0..n_size-1 + 
acc_rI[0..m_size-1] = real(A(m_top..m_top+m_size-1, 0) * B(0, n_top+I))
+       acc_iI[0..m_size-1] = imag(A(m_top..m_top+m_size-1, 0) * B(0, n_top+I)) */
+  VECTOR_T tmp_r0, tmp_i0, tmp_r1, tmp_i1, tmp_r2, tmp_i2, tmp_r3, tmp_i3;
+  VECTOR_T acc_r0, acc_i0, acc_r1, acc_i1, acc_r2, acc_i2, acc_r3, acc_i3;
+  VECTOR_T acc_r4, acc_i4, acc_r5, acc_i5, acc_r6, acc_i6, acc_r7, acc_i7;
+  RISCV_REPEAT_1 (RISCV_MUL, n_size, tmp_r, tmp_i, a_r, a_i, b_i, m_size);
+  RISCV_REPEAT_1 (RISCV_VFMACC, n_size, tmp_r, tmp_i, a_r, a_i, b_r, m_size);
+  RISCV_REPEAT_1 (COPY_TMP, n_size, acc_r, acc_i, tmp_r, tmp_i);
+  RISCV_REPEAT_2 (RISCV_MUL, n_size, tmp_r, tmp_i, a_r, a_i, b_i, m_size);
+  RISCV_REPEAT_2 (RISCV_VFMACC, n_size, tmp_r, tmp_i, a_r, a_i, b_r, m_size);
+  RISCV_REPEAT_2 (COPY_TMP, n_size, acc_r, acc_i, tmp_r, tmp_i);
+
+  for (BLASLONG k = 1; k < K; k++) {
+    /* b_r[0..n_size-1] = real(B(k, n_top)..B(k, n_top+n_size-1))
+       b_i[0..n_size-1] = imag(B(k, n_top)..B(k, n_top+n_size-1)) */
+    COPY_ROW (b_r, b_i, B, bi, n_size);
+    bi += n_size * 2;
+
+    /* a_r[0..m_size-1] = real(A(m_top, k)..A(m_top+m_size-1, k))
+       a_i[0..m_size-1] = imag(A(m_top, k)..A(m_top+m_size-1, k)) */
+    RISCV_LOAD_COLUMN (a_r, a_i, A, ai, m_size);
+    ai += m_size * 2;
+
+    /* for I = 0..n_size-1
+         acc_rI[0..m_size-1] += real(A(m_top..m_top+m_size-1, k) * B(k, n_top+I))
+         acc_iI[0..m_size-1] += imag(A(m_top..m_top+m_size-1, k) * B(k, n_top+I)) */
+    RISCV_REPEAT_1 (RISCV_MUL, n_size, tmp_r, tmp_i, a_r, a_i, b_i, m_size);
+    RISCV_REPEAT_1 (RISCV_VFMACC, n_size, tmp_r, tmp_i, a_r, a_i, b_r, m_size);
+    RISCV_REPEAT_1 (RISCV_ADD, n_size, acc_r, acc_i, tmp_r, tmp_i, m_size);
+    RISCV_REPEAT_2 (RISCV_MUL, n_size, tmp_r, tmp_i, a_r, a_i, b_i, m_size);
+    RISCV_REPEAT_2 (RISCV_VFMACC, n_size, tmp_r, tmp_i, a_r, a_i, b_r, m_size);
+    RISCV_REPEAT_2 (RISCV_ADD, n_size, acc_r, acc_i, tmp_r, tmp_i, m_size);
+  }
+
+  BLASLONG ci = n_top * ldc + m_top;
+  VECTOR_T c_r0, c_i0, c_r1, c_i1, c_r2, c_i2, c_r3, c_i3;
+  VECTOR_T c_r4, c_i4, c_r5, c_i5, c_r6, c_i6, c_r7, c_i7;
+
+  /* for I = 0..n_size-1
+       c_rI[0..m_size-1] = real(C(m_top..m_top+m_size-1, n_top+I))
+       c_iI[0..m_size-1] = imag(C(m_top..m_top+m_size-1, n_top+I))
+       c_rI[0..m_size-1] += alpha_r * acc_rI[0..m_size-1] - alpha_i * acc_iI[0..m_size-1]
+       c_iI[0..m_size-1] += alpha_r * acc_iI[0..m_size-1] + alpha_i * acc_rI[0..m_size-1]
+       real(C(m_top..m_top+m_size-1, n_top+I)) = c_rI[0..m_size-1]
+       imag(C(m_top..m_top+m_size-1, n_top+I)) = c_iI[0..m_size-1] */
+  RISCV_REPEAT_1 (RISCV_LOAD, n_size, c_r, c_i, C, ci, ldc, m_size);
+  RISCV_REPEAT_2 (RISCV_LOAD, n_size, c_r, c_i, C, ci, ldc, m_size);
+  RISCV_REPEAT_1 (RISCV_ACC_MUL_CONSTR, n_size, c_r, c_i, acc_r, acc_i, alphar, m_size);
+  RISCV_REPEAT_2 (RISCV_ACC_MUL_CONSTR, n_size, c_r, c_i, acc_r, acc_i, alphar, m_size);
+  RISCV_REPEAT_1 (RISCV_ACC_MUL_CONSTI, n_size, c_r, c_i, acc_r, acc_i, alphai, m_size);
+  RISCV_REPEAT_2 (RISCV_ACC_MUL_CONSTI, n_size, c_r, c_i, acc_r, acc_i, alphai, m_size);
+  RISCV_REPEAT_1 (RISCV_STORE, n_size, C, c_r, c_i, ci, ldc, m_size);
+  RISCV_REPEAT_2 (RISCV_STORE, n_size, C, c_r, c_i, ci, ldc, m_size);
+
+  return m_top + m_size;
+}
+
+/* Perform matrix multiplication between submatrices:
+     A(M,K) * B(K,n_size) = C(M, n_size) */
+
+static inline __attribute__((always_inline))
+BLASLONG kernel_column (BLASLONG M, BLASLONG N, BLASLONG K,
+                        FLOAT alphar, FLOAT alphai,
+                        FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc,
+                        BLASLONG n_top, BLASLONG n_size)
+{
+  BLASLONG m_top = 0;
+
+  for (BLASLONG i = 0; i < M / M_BLOCKSIZE; i++)
+    m_top = kernel (M, N, K, alphar, alphai, A, B, C, ldc, m_top, n_top, M_BLOCKSIZE, n_size);
+
+
if (M & (M_BLOCKSIZE - 1)) + kernel (M, N, K, alphar, alphai, A, B, C, ldc, m_top, n_top, M - m_top, n_size); + + return n_top + n_size; +} + +#define xstr(s) str(s) +#define str(s) #s + +/* Perform matrix multiplication between matrices: + A(M,K) * B(K,N) = C(M,N) */ + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc) +{ + //fprintf(stderr, "%s (with VLV): M=%ld, N=%ld, K=%ld, ldc=%ld\n", xstr(CNAME), M, N, K, ldc); + BLASLONG n_top = 0; + + for (BLASLONG j = 0; j < N / N_BLOCKSIZE; j++) + n_top = kernel_column (M, N, K, alphar, alphai, A, B, C, ldc, n_top, N_BLOCKSIZE); + +#if N_BLOCKSIZE > 4 + if (N & 4) + n_top = kernel_column (M, N, K, alphar, alphai, A, B, C, ldc, n_top, 4); +#endif +#if N_BLOCKSIZE > 2 + if (N & 2) + n_top = kernel_column (M, N, K, alphar, alphai, A, B, C, ldc, n_top, 2); +#endif +#if N_BLOCKSIZE > 1 + if (N & 1) + kernel_column (M, N, K, alphar, alphai, A, B, C, ldc, n_top, 1); +#endif + return 0; +} diff --git a/kernel/riscv64/zgemm_kernel_zvl128b.c b/kernel/riscv64/zgemm_kernel_zvl128b.c new file mode 100644 index 0000000000..ba8496e425 --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_zvl128b.c @@ -0,0 +1,52 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#define M_BLOCKSIZE ZGEMM_UNROLL_M +#define N_BLOCKSIZE ZGEMM_UNROLL_N + +#if M_BLOCKSIZE == 2 +#define RVV_MUL __riscv_vfmul_vf_f64m1 +#define RVV_LOAD __riscv_vlse64_v_f64m1 +#define RVV_STORE __riscv_vsse64_v_f64m1 +#define VECTOR_T vfloat64m1_t +#elif M_BLOCKSIZE == 4 +#define RVV_MUL __riscv_vfmul_vf_f64m2 +#define RVV_LOAD __riscv_vlse64_v_f64m2 +#define RVV_STORE __riscv_vsse64_v_f64m2 +#define VECTOR_T vfloat64m2_t +#elif M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f64m4 +#define RVV_LOAD __riscv_vlse64_v_f64m4 +#define RVV_STORE __riscv_vsse64_v_f64m4 +#define VECTOR_T vfloat64m4_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "zgemm_kernel_rvv_vlv_common.h" diff --git a/kernel/riscv64/zgemm_kernel_zvl256b.c b/kernel/riscv64/zgemm_kernel_zvl256b.c new file mode 100644 index 0000000000..ff32ba60a5 --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_zvl256b.c @@ -0,0 +1,52 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#define M_BLOCKSIZE ZGEMM_UNROLL_M +#define N_BLOCKSIZE ZGEMM_UNROLL_N + +#if M_BLOCKSIZE == 4 +#define RVV_MUL __riscv_vfmul_vf_f64m1 +#define RVV_LOAD __riscv_vlse64_v_f64m1 +#define RVV_STORE __riscv_vsse64_v_f64m1 +#define VECTOR_T vfloat64m1_t +#elif M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f64m2 +#define RVV_LOAD __riscv_vlse64_v_f64m2 +#define RVV_STORE __riscv_vsse64_v_f64m2 +#define VECTOR_T vfloat64m2_t +#elif M_BLOCKSIZE == 16 +#define RVV_MUL __riscv_vfmul_vf_f64m4 +#define RVV_LOAD __riscv_vlse64_v_f64m4 +#define RVV_STORE __riscv_vsse64_v_f64m4 +#define VECTOR_T vfloat64m4_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "zgemm_kernel_rvv_vlv_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_16_rvv_max.c b/kernel/riscv64/zgemm_tcopy_16_rvv_max.c new file mode 100644 index 0000000000..8545989257 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_16_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#else +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#endif + +#define BLOCKSIZE 16 + +#include "zgemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv_max.c b/kernel/riscv64/zgemm_tcopy_4_rvv_max.c new file mode 100644 index 0000000000..07b86a4862 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_4_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#endif + +#define BLOCKSIZE 4 + +#include "zgemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_8_rvv_max.c b/kernel/riscv64/zgemm_tcopy_8_rvv_max.c new file mode 100644 index 0000000000..53b28a5652 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_8_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#endif + +#define BLOCKSIZE 8 + +#include "zgemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_rvv_max_common.h b/kernel/riscv64/zgemm_tcopy_rvv_max_common.h new file mode 100644 index 0000000000..c0417a5e37 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_rvv_max_common.h @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define xstr(s) str(s) +#define str(s) #s + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + IFLOAT *aoffset = a; + IFLOAT *boffset = b; + IFLOAT *boffset2 = b + m * (n & ~(BLOCKSIZE - 1)) * 2; + + //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", xstr(CNAME), m, n, lda); + + for (BLASLONG j = m; j > 0; j--) { + IFLOAT *aoffset1 = aoffset; + IFLOAT *boffset1 = boffset; + + aoffset += lda * 2; + boffset += BLOCKSIZE * 2; + + for (BLASLONG i = n / BLOCKSIZE; i > 0; i--) { + size_t vl = BLOCKSIZE; + + FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v, vl); + v = VLEV_FLOAT(aoffset1 + vl, vl); + VSEV_FLOAT(boffset1 + vl, v, vl); + + aoffset1 += BLOCKSIZE * 2; + boffset1 += BLOCKSIZE * m * 2; + } + + if (n & (BLOCKSIZE - 1)) { + size_t vl = n & (BLOCKSIZE - 1); + + FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v, vl); + v = VLEV_FLOAT(aoffset1 + vl, vl); + VSEV_FLOAT(boffset2 + vl, v, vl); + + boffset2 += vl * 2; + } + } + + return 0; +} From 320e2d9331532e66ca890179a17a44e32670399a Mon Sep 17 00:00:00 2001 From: Kwok Cheung Yeung Date: Wed, 3 Dec 2025 20:12:21 +0000 Subject: [PATCH 3/6] Replace original RISC-V ZVL kernels with new kernels The original kernels have been deleted with this commit. --- kernel/riscv64/KERNEL.RISCV64_ZVL128B | 24 +- kernel/riscv64/KERNEL.RISCV64_ZVL256B | 25 +- kernel/riscv64/cgemm_kernel_8x4_zvl128b.c | 996 ---------- kernel/riscv64/cgemm_kernel_8x8_zvl256b.c | 1931 -------------------- kernel/riscv64/dgemm_kernel_8x4_zvl128b.c | 492 ----- kernel/riscv64/dgemm_kernel_8x8_zvl256b.c | 860 --------- kernel/riscv64/sgemm_kernel_16x8_zvl256b.c | 1081 ----------- kernel/riscv64/sgemm_kernel_8x8_zvl128b.c | 791 -------- kernel/riscv64/zgemm_kernel_4x4_zvl128b.c | 720 -------- kernel/riscv64/zgemm_kernel_8x4_zvl256b.c | 1253 ------------- 10 files changed, 25 insertions(+), 8148 deletions(-) delete mode 100644 kernel/riscv64/cgemm_kernel_8x4_zvl128b.c delete mode 100644 kernel/riscv64/cgemm_kernel_8x8_zvl256b.c delete mode 100644 kernel/riscv64/dgemm_kernel_8x4_zvl128b.c delete mode 100644 kernel/riscv64/dgemm_kernel_8x8_zvl256b.c delete mode 100644 kernel/riscv64/sgemm_kernel_16x8_zvl256b.c delete mode 100644 kernel/riscv64/sgemm_kernel_8x8_zvl128b.c delete mode 100644 kernel/riscv64/zgemm_kernel_4x4_zvl128b.c delete mode 100644 kernel/riscv64/zgemm_kernel_8x4_zvl256b.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B index ad7db5622e..2f8013eeed 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL128B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -95,10 +95,10 @@ DGEMVTKERNEL = gemv_t_rvv.c CGEMVTKERNEL = zgemv_t_rvv.c ZGEMVTKERNEL = zgemv_t_rvv.c -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +SGEMMKERNEL = sgemm_kernel_zvl128b.c ifneq ($(filter $(SGEMM_UNROLL_N),4 8 16),) SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c -SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv_max.c else SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c @@ -109,7 +109,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) ifneq ($(filter $(SGEMM_UNROLL_M),4 8 16),) SGEMMINCOPY = gemm_ncopy_$(SGEMM_UNROLL_M)_rvv.c -SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv.c +SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv_max.c else SGEMMINCOPY = 
../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c @@ -118,10 +118,10 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DGEMMKERNEL = dgemm_kernel_zvl128b.c DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c ifneq ($(filter $(DGEMM_UNROLL_N),4 8 16),) -DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv_max.c else DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif @@ -131,7 +131,7 @@ DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c ifneq ($(filter $(DGEMM_UNROLL_M),4 8 16),) -DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv.c +DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv_max.c else DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif @@ -139,28 +139,28 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CGEMMKERNEL = cgemm_kernel_zvl128b.c CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = zgemm_tcopy_$(CGEMM_UNROLL_N)_rvv_max.c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = zgemm_tcopy_$(CGEMM_UNROLL_M)_rvv_max.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZGEMMKERNEL = zgemm_kernel_zvl128b.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_N)_rvv_max.c ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_M)_rvv_max.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index c48095bb21..650169d3e5 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -95,10 +95,10 @@ DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +SGEMMKERNEL = sgemm_kernel_zvl256b.c ifneq ($(filter $(SGEMM_UNROLL_N),4 8 16),) SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c -SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv_max.c else SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c @@ -108,7 +108,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) ifneq ($(filter 
$(SGEMM_UNROLL_M),4 8 16),) SGEMMINCOPY = gemm_ncopy_$(SGEMM_UNROLL_M)_rvv.c -SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv.c +SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv_max.c else SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c @@ -117,19 +117,20 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +DGEMMKERNEL = dgemm_kernel_zvl256b.c DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c ifneq ($(filter $(DGEMM_UNROLL_N),4 8 16),) -DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv_max.c else DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif + DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c ifneq ($(filter $(DGEMM_UNROLL_M),4 8 16),) -DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv.c +DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv_max.c else DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif @@ -137,28 +138,28 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +CGEMMKERNEL = cgemm_kernel_zvl256b.c CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = zgemm_tcopy_$(CGEMM_UNROLL_N)_rvv_max.c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = zgemm_tcopy_$(CGEMM_UNROLL_M)_rvv_max.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c +ZGEMMKERNEL = zgemm_kernel_zvl256b.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_N)_rvv_max.c ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_M)_rvv_max.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif diff --git a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c deleted file mode 100644 index bd615389c8..0000000000 --- a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c +++ /dev/null @@ -1,996 +0,0 @@ -/* - -AUTOGENERATED KERNEL -Script: ./kernel/riscv64/generate_kernel.py -Settings: - LMUL=2 - M=8 - M_tail_scalar_from=2 - N=4 - __riscv_='__riscv_' - complex=True - conjugate=False - cpu='zvl128b' - force_acc_double=False - index_type='BLASLONG' - op='gemm' - param_precision='float' - reg_width_bits=128 - tail_policy='' - trace=False - -Derived: - ELEN_ACC=32 - ELEN_PARAM=32 - LMUL_ACC=2 - VFMACC='__riscv_vfmacc_vf_f32m2' - VFMUL='__riscv_vfmul_vf_f32m2' - VLEV='__riscv_vle32_v_f32m2' - VLSEV='__riscv_vlse32_v_f32m2' - 
VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' - VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' - VSETVL='__riscv_vsetvl_e32m2' - VSEV='__riscv_vse32_v_f32m2' - VSSEV='__riscv_vsse32_v_f32m2' - acc_vector_t='vfloat32m2_t' - output='cgemm_kernel_8x4_zvl128b.c' - param_scalar_t='float' - param_vector_t='vfloat32m2_t' - -*/ - -#include "common.h" - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define S0 1 -#define S1 -1 -#define S2 1 -#define S3 1 -#define VFMACC_RR __riscv_vfmsac -#define VFMACC_RI __riscv_vfmacc -#endif -#if defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define S0 1 -#define S1 1 -#define S2 1 -#define S3 -1 -#define VFMACC_RR __riscv_vfmacc -#define VFMACC_RI __riscv_vfmsac -#endif -#if defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define S0 1 -#define S1 1 -#define S2 -1 -#define S3 1 -#define VFMACC_RR __riscv_vfmacc -#define VFMACC_RI __riscv_vfnmsac -#endif -#if defined(RR) || defined(RC) || defined(CR) || defined(CC) -#define S0 1 -#define S1 -1 -#define S2 -1 -#define S3 -1 -#define VFMACC_RR __riscv_vfmsac -#define VFMACC_RI __riscv_vfnmacc -#endif - -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) - -{ - BLASLONG gvl = 0; - BLASLONG m_top = 0; - BLASLONG n_top = 0; - - // -- MAIN PASS - - for (BLASLONG j = 0; j < N / 4; j += 1) { - m_top = 0; - BLASLONG gvl = __riscv_vsetvl_e32m2(8); - - for (BLASLONG i = 0; i < M / 8; i += 1) { - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - float B1i = B[bi + 1 * 2 + 1]; - float B2r = B[bi + 2 * 2 + 0]; - float B2i = B[bi + 2 * 2 + 1]; - float B3r = B[bi + 3 * 2 + 0]; - float B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k - // leaving 6 vector registers for temporaries - // performing 2 operations between reuses of temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - vfloat32m2_t ACC2r = tmp0r; - vfloat32m2_t ACC2i = tmp0i; - vfloat32m2_t ACC3r = tmp1r; - vfloat32m2_t ACC3i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - B2r = B[bi + 2 * 2 + 0]; - B2i = B[bi + 2 * 2 + 1]; - B3r = B[bi + 3 * 2 + 0]; - B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * 
gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); - ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); - ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); - ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); - C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); - C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); - C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); - C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); - C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); - C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); - - m_top += 8; - } - - // -- tails for main pass - - if (M & 4) { - gvl = 
__riscv_vsetvl_e32m2(4); - - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - float B1i = B[bi + 1 * 2 + 1]; - float B2r = B[bi + 2 * 2 + 0]; - float B2i = B[bi + 2 * 2 + 1]; - float B3r = B[bi + 3 * 2 + 0]; - float B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k - // leaving 6 vector registers for temporaries - // performing 2 operations between reuses of temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - vfloat32m2_t ACC2r = tmp0r; - vfloat32m2_t ACC2i = tmp0i; - vfloat32m2_t ACC3r = tmp1r; - vfloat32m2_t ACC3i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - B2r = B[bi + 2 * 2 + 0]; - B2i = B[bi + 2 * 2 + 1]; - B3r = B[bi + 3 * 2 + 0]; - B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); - ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); - ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); - ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += 
ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); - C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); - C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); - C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); - C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); - C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); - C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); - - m_top += 4; - } - - if (M & 2) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - float result4 = 0; - float result5 = 0; - float result6 = 0; - float result7 = 0; - float result8 = 0; - float result9 = 0; - float result10 = 0; - float result11 = 0; - float result12 = 0; - float result13 = 0; - float result14 = 0; - float result15 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; - result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; - result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; - result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; - result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; - result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; - result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; - result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; - result12 
+= S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; - result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; - result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; - result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; - ai += 2 * 2; - bi += 4 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; - Cr += result4 * alphar; - Ci += result5 * alphar; - Cr -= result5 * alphai; - Ci += result4 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; - Cr += result6 * alphar; - Ci += result7 * alphar; - Cr -= result7 * alphai; - Ci += result6 * alphai; - C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; - Cr += result8 * alphar; - Ci += result9 * alphar; - Cr -= result9 * alphai; - Ci += result8 * alphai; - C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; - Cr += result10 * alphar; - Ci += result11 * alphar; - Cr -= result11 * alphai; - Ci += result10 * alphai; - C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; - Cr += result12 * alphar; - Ci += result13 * alphar; - Cr -= result13 * alphai; - Ci += result12 * alphai; - C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; - Cr += result14 * alphar; - Ci += result15 * alphar; - Cr -= result15 * alphai; - Ci += result14 * alphai; - C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; - m_top += 2; - } - - if (M & 1) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - float result4 = 0; - float result5 = 0; - float result6 = 0; - float result7 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; - result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; - result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; - result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; - ai += 1 * 2; - bi += 4 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - 
float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; - Cr += result4 * alphar; - Ci += result5 * alphar; - Cr -= result5 * alphai; - Ci += result4 * alphai; - C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; - Cr += result6 * alphar; - Ci += result7 * alphar; - Cr -= result7 * alphai; - Ci += result6 * alphai; - C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; - m_top += 1; - } - - n_top += 4; - } - - // -- tails for N=2 - - if (N & 2) { - gvl = __riscv_vsetvl_e32m2(8); - m_top = 0; - - for (BLASLONG i = 0; i < M / 8; i += 1) { - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - float B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k - // leaving 10 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = 
__riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - - m_top += 8; - } - - if (M & 4) { - gvl = __riscv_vsetvl_e32m2(4); - - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - float B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k - // leaving 10 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - 
C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - - m_top += 4; - } - - if (M & 2) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - float result4 = 0; - float result5 = 0; - float result6 = 0; - float result7 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; - result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; - result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; - result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; - ai += 2 * 2; - bi += 2 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; - Cr += result4 * alphar; - Ci += result5 * alphar; - Cr -= result5 * alphai; - Ci += result4 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; - Cr += result6 * alphar; - Ci += result7 * alphar; - Cr -= result7 * alphai; - Ci += result6 * alphai; - C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; - m_top += 2; - } - - if (M & 1) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - ai += 1 * 2; - bi += 2 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; - Cr += result2 * alphar; 
- Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - m_top += 1; - } - - n_top += 2; - } - - // -- tails for N=1 - - if (N & 1) { - gvl = __riscv_vsetvl_e32m2(8); - m_top = 0; - - for (BLASLONG i = 0; i < M / 8; i += 1) { - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k - // leaving 12 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - - m_top += 8; - } - - if (M & 4) { - gvl = __riscv_vsetvl_e32m2(4); - - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k - // leaving 12 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, 
tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - - m_top += 4; - } - - if (M & 2) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; - result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; - ai += 2 * 2; - bi += 1 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; - m_top += 2; - } - - if (M & 1) { - float result0 = 0; - float result1 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - ai += 1 * 2; - bi += 1 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - m_top += 1; - } - - n_top += 1; - } - - return 0; -} diff --git a/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c b/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c deleted file mode 100644 index 7980c029a4..0000000000 --- a/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c +++ /dev/null @@ -1,1931 +0,0 @@ -/* - -AUTOGENERATED KERNEL -Settings: - LMUL=1 - M=8 - M_tail_scalar_from=1 - N=8 - __riscv_='__riscv_' - complex=True - conjugate=False - cpu='zvl256b' - force_acc_double=False - index_type='BLASLONG' - op='gemm' - param_precision='float' - reg_width_bits=256 - tail_policy='' - trace=False - -Derived: - ELEN_ACC=32 - ELEN_PARAM=32 - LMUL_ACC=1 - VFMACC='__riscv_vfmacc_vf_f32m1' - VFMUL='__riscv_vfmul_vf_f32m1' - VLEV='__riscv_vle32_v_f32m1' - VLSEV='__riscv_vlse32_v_f32m1' - VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1' - VMUL_TO_ACC='__riscv_vfmul_vf_f32m1' - VSETVL='__riscv_vsetvl_e32m1' - VSEV='__riscv_vse32_v_f32m1' - VSSEV='__riscv_vsse32_v_f32m1' - acc_vector_t='vfloat32m1_t' - output='cgemm_kernel_8x8_zvl256b.c' - param_scalar_t='float' - 
param_vector_t='vfloat32m1_t'
-
-*/
-
-#include "common.h"
-
-
-
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- #define S0 1
- #define S1 -1
- #define S2 1
- #define S3 1
- #define VFMACC_RR __riscv_vfmsac
- #define VFMACC_RI __riscv_vfmacc
-#endif
-#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
- #define S0 1
- #define S1 1
- #define S2 1
- #define S3 -1
- #define VFMACC_RR __riscv_vfmacc
- #define VFMACC_RI __riscv_vfmsac
-#endif
-#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
- #define S0 1
- #define S1 1
- #define S2 -1
- #define S3 1
- #define VFMACC_RR __riscv_vfmacc
- #define VFMACC_RI __riscv_vfnmsac
-#endif
-#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
- #define S0 1
- #define S1 -1
- #define S2 -1
- #define S3 -1
- #define VFMACC_RR __riscv_vfmsac
- #define VFMACC_RI __riscv_vfnmacc
-#endif
-
-int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
-
-{
- BLASLONG gvl = 0;
- BLASLONG m_top = 0;
- BLASLONG n_top = 0;
-
-
- // -- MAIN PASS
-
- for (BLASLONG j=0; j<N/8; j+=1) {

[... remainder of deleted cgemm_kernel_8x8_zvl256b.c omitted ...]

From: Kwok Cheung Yeung
Date: Wed, 3 Dec 2025 21:02:48 +0000
Subject: [PATCH 4/6] Allow TRMM unroll sizes to be set independently of the
 GEMM unroll sizes for RISC-V

The unroll factors of the TRMM kernels are currently set to those of the
equivalent GEMM kernels. As we are not dealing with the TRMM kernels for
now, I have added extra xTRMM_UNROLL_(M|N) parameters to allow them to
be set independently of the xGEMM_UNROLL_(M|N) parameters. If the new
TRMM parameter is not defined, then it falls back to the original
behaviour of using the GEMM parameters.
---
 getarch_2nd.c                         | 48 +++++++++++++++++++++++++++
 kernel/riscv64/KERNEL.RISCV64_ZVL128B | 46 ++++++++++++-------------
 kernel/riscv64/KERNEL.RISCV64_ZVL256B |  8 ++---
 3 files changed, 75 insertions(+), 27 deletions(-)

diff --git a/getarch_2nd.c b/getarch_2nd.c
index 2085556bd6..744ad5ee7d 100644
--- a/getarch_2nd.c
+++ b/getarch_2nd.c
@@ -103,7 +103,55 @@ int main(int argc, char **argv) {
   printf("XGEMM3M_UNROLL_N=%d\n", QGEMM_DEFAULT_UNROLL_N);
 #endif
 
+#ifdef ARCH_RISCV64
+#ifdef STRMM_DEFAULT_UNROLL_M
+  printf("STRMM_UNROLL_M=%d\n", STRMM_DEFAULT_UNROLL_M);
+#else
+  printf("STRMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
+#endif
+
+#ifdef STRMM_DEFAULT_UNROLL_N
+  printf("STRMM_UNROLL_N=%d\n", STRMM_DEFAULT_UNROLL_N);
+#else
+  printf("STRMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
+#endif
+
+#ifdef DTRMM_DEFAULT_UNROLL_M
+  printf("DTRMM_UNROLL_M=%d\n", DTRMM_DEFAULT_UNROLL_M);
+#else
+  printf("DTRMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);
+#endif
+
+#ifdef DTRMM_DEFAULT_UNROLL_N
+  printf("DTRMM_UNROLL_N=%d\n", DTRMM_DEFAULT_UNROLL_N);
+#else
+  printf("DTRMM_UNROLL_N=%d\n", DGEMM_DEFAULT_UNROLL_N);
+#endif
+#ifdef CTRMM_DEFAULT_UNROLL_M
+  printf("CTRMM_UNROLL_M=%d\n", CTRMM_DEFAULT_UNROLL_M);
+#else
+  printf("CTRMM_UNROLL_M=%d\n", CGEMM_DEFAULT_UNROLL_M);
+#endif
+
+#ifdef CTRMM_DEFAULT_UNROLL_N
+  printf("CTRMM_UNROLL_N=%d\n", CTRMM_DEFAULT_UNROLL_N);
+#else
+  printf("CTRMM_UNROLL_N=%d\n", CGEMM_DEFAULT_UNROLL_N);
+#endif
+
+#ifdef ZTRMM_DEFAULT_UNROLL_M
+  printf("ZTRMM_UNROLL_M=%d\n", ZTRMM_DEFAULT_UNROLL_M);
+#else
+  printf("ZTRMM_UNROLL_M=%d\n", ZGEMM_DEFAULT_UNROLL_M);
+#endif
+
+#ifdef ZTRMM_DEFAULT_UNROLL_N
+  printf("ZTRMM_UNROLL_N=%d\n", ZTRMM_DEFAULT_UNROLL_N);
+#else
+  printf("ZTRMM_UNROLL_N=%d\n", ZGEMM_DEFAULT_UNROLL_N);
+#endif
+#endif /* ARCH_RISCV64 */
 }
 
diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B
index 
2f8013eeed..038d58e8f5 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL128B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -165,29 +165,29 @@ ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c -STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c -STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c -STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c -STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c - -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c -DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c -DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c -DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c -DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c - -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c -CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c -CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c -CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c -CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c - -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c -ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c -ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c -ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c -ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c +STRMMKERNEL = strmm_kernel_$(STRMM_UNROLL_M)x$(STRMM_UNROLL_N)_zvl128b.c +STRMMUNCOPY_M = ../generic/trmm_uncopy_$(STRMM_UNROLL_M).c +STRMMLNCOPY_M = ../generic/trmm_lncopy_$(STRMM_UNROLL_M).c +STRMMUTCOPY_M = ../generic/trmm_utcopy_$(STRMM_UNROLL_M).c +STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(STRMM_UNROLL_M).c + +DTRMMKERNEL = dtrmm_kernel_$(DTRMM_UNROLL_M)x$(DTRMM_UNROLL_N)_zvl128b.c +DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DTRMM_UNROLL_M).c +DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DTRMM_UNROLL_M).c +DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DTRMM_UNROLL_M).c +DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DTRMM_UNROLL_M).c + +CTRMMKERNEL = ctrmm_kernel_$(CTRMM_UNROLL_M)x$(CTRMM_UNROLL_N)_zvl128b.c +CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CTRMM_UNROLL_M).c +CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CTRMM_UNROLL_M).c +CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CTRMM_UNROLL_M).c +CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CTRMM_UNROLL_M).c + +ZTRMMKERNEL = ztrmm_kernel_$(ZTRMM_UNROLL_M)x$(ZTRMM_UNROLL_N)_zvl128b.c +ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZTRMM_UNROLL_M).c +ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZTRMM_UNROLL_M).c +ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZTRMM_UNROLL_M).c +ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZTRMM_UNROLL_M).c STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index 650169d3e5..b95f7ab3e3 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -90,10 +90,10 @@ DGEMVTKERNEL = gemv_t_vector.c CGEMVTKERNEL = zgemv_t_vector.c ZGEMVTKERNEL = zgemv_t_vector.c -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c +STRMMKERNEL = 
strmm_kernel_$(STRMM_UNROLL_M)x$(STRMM_UNROLL_N)_zvl256b.c
+DTRMMKERNEL = dtrmm_kernel_$(DTRMM_UNROLL_M)x$(DTRMM_UNROLL_N)_zvl256b.c
+CTRMMKERNEL = ctrmm_kernel_$(CTRMM_UNROLL_M)x$(CTRMM_UNROLL_N)_zvl256b.c
+ZTRMMKERNEL = ztrmm_kernel_$(ZTRMM_UNROLL_M)x$(ZTRMM_UNROLL_N)_zvl256b.c
 
 SGEMMKERNEL = sgemm_kernel_zvl256b.c
 ifneq ($(filter $(SGEMM_UNROLL_N),4 8 16),)

From 38d528c7fd6279847b3f0719be1cbdfa41f8cc0c Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung
Date: Wed, 3 Dec 2025 22:06:08 +0000
Subject: [PATCH 5/6] Change DGEMM tiling size from 8x8 to 4x8 for the RISC-V
 ZVL256B architecture

Testing has shown that 4x8 or 4x4 performs better than the original 8x8
tiling size for this kernel. As we do not wish to perturb the behaviour
of the DTRMM kernel at this point, the DTRMM tiling size is explicitly
set to the original 8x8.
---
 cmake/prebuild.cmake | 2 +-
 param.h              | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index 9320af9224..cfc8a893be 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -1475,7 +1475,7 @@ endif ()
       "#define L2_ASSOCIATIVE 4\n")
     set(SGEMM_UNROLL_M 16)
     set(SGEMM_UNROLL_N 8)
-    set(DGEMM_UNROLL_M 8)
+    set(DGEMM_UNROLL_M 4)
     set(DGEMM_UNROLL_N 8)
     set(CGEMM_UNROLL_M 8)
     set(CGEMM_UNROLL_N 8)
diff --git a/param.h b/param.h
index 8e598d8a01..8caa997c14 100644
--- a/param.h
+++ b/param.h
@@ -3231,7 +3231,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_UNROLL_M 16
 #define SGEMM_DEFAULT_UNROLL_N 8
 
-#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_M 4
 #define DGEMM_DEFAULT_UNROLL_N 8
 
 #define CGEMM_DEFAULT_UNROLL_M 8
@@ -3240,6 +3240,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_N 4
 
+#define DTRMM_DEFAULT_UNROLL_M 8
+#define DTRMM_DEFAULT_UNROLL_N 8
+
 #undef SHGEMM_DEFAULT_P
 #define SHGEMM_DEFAULT_P 128
 #undef SBGEMM_DEFAULT_P

From 9b30a9a8af43449b1d263aadfe491b5f837f94b7 Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung
Date: Fri, 5 Dec 2025 12:21:08 +0000
Subject: [PATCH 6/6] Make new RISC-V kernels work with DYNAMIC_ARCH enabled

If DYNAMIC_ARCH is enabled, then the various xGEMM_UNROLL_(M|N) macros
expand to 'gotoblas->', which prevents their use in macro conditionals,
as the value is only known at runtime. To get around this, we use the
xGEMM_DEFAULT_UNROLL_(M|N) macros instead, which expand to compile-time
constants.

As the tiling factors used in the tcopy/ncopy functions are also
selected at compile time, this does not reduce functionality: changing
the tiling at runtime without also changing tcopy/ncopy would produce
incorrect results anyway.
---
 kernel/riscv64/cgemm_kernel_zvl128b.c | 4 ++--
 kernel/riscv64/cgemm_kernel_zvl256b.c | 4 ++--
 kernel/riscv64/dgemm_kernel_zvl128b.c | 4 ++--
 kernel/riscv64/dgemm_kernel_zvl256b.c | 4 ++--
 kernel/riscv64/sgemm_kernel_zvl128b.c | 4 ++--
 kernel/riscv64/sgemm_kernel_zvl256b.c | 4 ++--
 kernel/riscv64/zgemm_kernel_zvl128b.c | 4 ++--
 kernel/riscv64/zgemm_kernel_zvl256b.c | 4 ++--
 8 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/kernel/riscv64/cgemm_kernel_zvl128b.c b/kernel/riscv64/cgemm_kernel_zvl128b.c
index c37d365775..6e9ac386d5 100644
--- a/kernel/riscv64/cgemm_kernel_zvl128b.c
+++ b/kernel/riscv64/cgemm_kernel_zvl128b.c
@@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" -#define M_BLOCKSIZE CGEMM_UNROLL_M -#define N_BLOCKSIZE CGEMM_UNROLL_N +#define M_BLOCKSIZE CGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE CGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 4 #define RVV_MUL __riscv_vfmul_vf_f32m1 diff --git a/kernel/riscv64/cgemm_kernel_zvl256b.c b/kernel/riscv64/cgemm_kernel_zvl256b.c index 8b991bfc4f..729b99c7e2 100644 --- a/kernel/riscv64/cgemm_kernel_zvl256b.c +++ b/kernel/riscv64/cgemm_kernel_zvl256b.c @@ -28,8 +28,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define M_BLOCKSIZE CGEMM_UNROLL_M -#define N_BLOCKSIZE CGEMM_UNROLL_N +#define M_BLOCKSIZE CGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE CGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 8 #define RVV_MUL __riscv_vfmul_vf_f32m1 diff --git a/kernel/riscv64/dgemm_kernel_zvl128b.c b/kernel/riscv64/dgemm_kernel_zvl128b.c index 45a9400063..d25f57588a 100644 --- a/kernel/riscv64/dgemm_kernel_zvl128b.c +++ b/kernel/riscv64/dgemm_kernel_zvl128b.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define M_BLOCKSIZE DGEMM_UNROLL_M -#define N_BLOCKSIZE DGEMM_UNROLL_N +#define M_BLOCKSIZE DGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE DGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 2 #define RVV_MUL __riscv_vfmul_vf_f64m1 diff --git a/kernel/riscv64/dgemm_kernel_zvl256b.c b/kernel/riscv64/dgemm_kernel_zvl256b.c index 84e66ab721..844c91e93e 100644 --- a/kernel/riscv64/dgemm_kernel_zvl256b.c +++ b/kernel/riscv64/dgemm_kernel_zvl256b.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define M_BLOCKSIZE DGEMM_UNROLL_M -#define N_BLOCKSIZE DGEMM_UNROLL_N +#define M_BLOCKSIZE DGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE DGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 4 #define RVV_MUL __riscv_vfmul_vf_f64m1 diff --git a/kernel/riscv64/sgemm_kernel_zvl128b.c b/kernel/riscv64/sgemm_kernel_zvl128b.c index 60e9d62ac0..cfd27662e9 100644 --- a/kernel/riscv64/sgemm_kernel_zvl128b.c +++ b/kernel/riscv64/sgemm_kernel_zvl128b.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define M_BLOCKSIZE SGEMM_UNROLL_M -#define N_BLOCKSIZE SGEMM_UNROLL_N +#define M_BLOCKSIZE SGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE SGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 4 #define RVV_MUL __riscv_vfmul_vf_f32m1 diff --git a/kernel/riscv64/sgemm_kernel_zvl256b.c b/kernel/riscv64/sgemm_kernel_zvl256b.c index 925e01d102..316adbbca5 100644 --- a/kernel/riscv64/sgemm_kernel_zvl256b.c +++ b/kernel/riscv64/sgemm_kernel_zvl256b.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define M_BLOCKSIZE SGEMM_UNROLL_M -#define N_BLOCKSIZE SGEMM_UNROLL_N +#define M_BLOCKSIZE SGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE SGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 8 #define RVV_MUL __riscv_vfmul_vf_f32m1 diff --git a/kernel/riscv64/zgemm_kernel_zvl128b.c b/kernel/riscv64/zgemm_kernel_zvl128b.c index ba8496e425..ad8773cc4b 100644 --- a/kernel/riscv64/zgemm_kernel_zvl128b.c +++ b/kernel/riscv64/zgemm_kernel_zvl128b.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#define M_BLOCKSIZE ZGEMM_UNROLL_M -#define N_BLOCKSIZE ZGEMM_UNROLL_N +#define M_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 2 #define RVV_MUL __riscv_vfmul_vf_f64m1 diff --git a/kernel/riscv64/zgemm_kernel_zvl256b.c b/kernel/riscv64/zgemm_kernel_zvl256b.c index ff32ba60a5..084bedb080 100644 --- a/kernel/riscv64/zgemm_kernel_zvl256b.c +++ b/kernel/riscv64/zgemm_kernel_zvl256b.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define M_BLOCKSIZE ZGEMM_UNROLL_M -#define N_BLOCKSIZE ZGEMM_UNROLL_N +#define M_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_N #if M_BLOCKSIZE == 4 #define RVV_MUL __riscv_vfmul_vf_f64m1